In [2]:
import azureml.core

In [3]:
print(azureml.core.VERSION)

1.27.0


# Create Workspace

In [5]:
from azureml.core import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

sid = '6e05a726-0769-4103-aed1-e2ee3a907a5a'
tenant_id = '0f942ca0-ebef-4f26-80f8-f501d599ba90'

forced_interactive_auth = InteractiveLoginAuthentication(tenant_id=tenant_id, force=True)

ws = Workspace.create(name='azureml_workspace2',
            subscription_id= sid, 
            resource_group='rgazureml',
            create_resource_group = True,
            location='centralus'
            )

Note, we have launched a browser for you to login. For old experience with device code, use "az login --use-device-code"


Performing interactive authentication. Please follow the instructions on the terminal.
You have logged in. Now let us find all the subscriptions to which you have access...
Interactive authentication successfully completed.
Deploying StorageAccount with name azuremlwstorage84e2b186e.
Deploying AppInsights with name azuremlwinsightscb75c22b.
Deployed AppInsights with name azuremlwinsightscb75c22b. Took 8.52 seconds.
Deploying KeyVault with name azuremlwkeyvault0fd0acb0.
Deployed KeyVault with name azuremlwkeyvault0fd0acb0. Took 25.55 seconds.
Deployed StorageAccount with name azuremlwstorage84e2b186e. Took 34.13 seconds.
Deploying Workspace with name azureml_workspace2.
Deployed Workspace with name azureml_workspace2. Took 34.31 seconds.


# Upload to Default Data Store

In [6]:
#upload data by using get_default_datastore()
ds = ws.get_default_datastore()
ds.upload(src_dir='./recodata', target_path='recodata', overwrite=True, show_progress=True)

print('Done')

Uploading an estimated of 3 files
Uploading ./recodata\professionals.csv
Uploaded ./recodata\professionals.csv, 1 files out of an estimated total of 3
Uploading ./recodata\questions.csv
Uploaded ./recodata\questions.csv, 2 files out of an estimated total of 3
Uploading ./recodata\answers.csv
Uploaded ./recodata\answers.csv, 3 files out of an estimated total of 3
Uploaded 3 files
Done


# Create the Training Folder

In [7]:
import os

# create the folder
folder_training_script = './recocode'
os.makedirs(folder_training_script, exist_ok=True)

print('Done')

Done


# Create the Compute Cluster

In [8]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Step 1: name the cluster and set the minimal and maximal number of nodes 
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpucluster")
min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0)
max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 1)

# Step 2: choose environment variables 
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

provisioning_config = AmlCompute.provisioning_configuration(
    vm_size = vm_size, min_nodes = min_nodes, max_nodes = max_nodes)

# create the cluster
compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

print('Compute target created')

Compute target created


# Create the Training Script

In [29]:
%%writefile $folder_training_script/train.py

import argparse
import os
import numpy as np
import pandas as pd
import glob
import gc
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string
import pickle
import spacy
from spacy.cli.download import download as spacy_download

from azureml.core import Run
# from utils import load_data

parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
args = parser.parse_args()

def clean_text(text):
    '''Make text lowercase,remove punctuation
    .'''
    text = str(text).lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    return text


data_folder = os.path.join(args.data_folder, 'recodata')
print('Data folder:', data_folder)

questions  = pd.read_csv(os.path.join(data_folder, 'questions.csv'))
professionals = pd.read_csv(os.path.join(data_folder, 'professionals.csv'))
answers = pd.read_csv(os.path.join(data_folder, 'answers.csv'))

prof_ans = pd.merge(professionals, answers, how = 'left' ,
                    left_on = 'professionals_id', right_on = 'answers_author_id')
prof_ans_q = pd.merge(prof_ans, questions, how = 'left' ,
                      left_on = 'answers_question_id', right_on = 'questions_id')

prof_ans_q = prof_ans_q[(~prof_ans_q["questions_title"].isna()) | (~prof_ans_q["questions_body"].isna()) ]

q = prof_ans_q["questions_title"] + " " + prof_ans_q["questions_body"]
q  = q.apply(lambda x:clean_text(x))

spacy_download('en')
spacy_download('en_core_web_lg')
nlp = spacy.load('en_core_web_lg')
spacy_questions = [nlp(q1).vector for q1 in q ]
pickle.dump(spacy_questions,open("outputs/spacy_questions.pkl","wb"))

# Get the experiment run context
run = Run.get_context()


run.complete()

Overwriting ./recocode/train.py


# Create the Environment

In [30]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

# Create a Python environment for the experiment
wine_env = Environment("reco-env")
wine_env.python.user_managed_dependencies = False # Let Azure ML manage dependencies
wine_env.docker.enabled = False # Use a docker container

# Create a set of package dependencies (conda or pip as required)
wine_packages = CondaDependencies.create(conda_packages = ['scikit-learn','spacy'])

# Add the dependencies to the environment
wine_env.python.conda_dependencies = wine_packages

print(wine_env.name, 'defined.')



reco-env defined.


# Register the environment in the workspace

In [31]:
# Register the environment
wine_env.register(workspace=ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/intelmpi2018.3-ubuntu16.04:20210301.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "reco-env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"
   

# Create the Estimator

In [32]:
from azureml.train.estimator import Estimator

script_params = {
    '--data-folder': ds.as_mount()
}

registered_env = Environment.get(ws, 'reco-env')

# Create an estimator
estimator = Estimator(source_directory=folder_training_script,
                      script_params=script_params,
                      compute_target = compute_target, # Run the experiment on the remote compute target
                      environment_definition = registered_env,
                      entry_script='train.py')



# Create the Experiment

In [33]:
from azureml.core import Experiment

#Create an experiment
experiment = Experiment(workspace = ws, name = "reco_expt")

print('Experiment created')

Experiment created


# Submit the Experiment with the Estimator information

In [34]:
run = experiment.submit(config=estimator)
run



Experiment,Id,Type,Status,Details Page,Docs Page
reco_expt,reco_expt_1620842844_99e76fd4,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [35]:
#get the result
print(run.get_metrics())

{}


# Register the model

In [36]:
spacy_questions = run.register_model(model_name='spacy_questions_model',
                           model_path='outputs/spacy_questions.pkl',
                           tags = {'area': "spacy_questions", 'type': "spacy"},
                           description = "spacy_questions")

print(spacy_questions.name)

spacy_questions_model


# Inference Section

## Create the Inference Script

In [80]:
%%writefile $folder_training_script/score.py
import json
import joblib
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import spacy
from spacy.cli.download import download as spacy_download

from azureml.core.model import Model

# Called when the service is loaded
def init():
    global model_path
    global nlp
    
    # Get the path to the registered model file and load it
    model_path = Model.get_model_path('spacy_questions_model')
    spacy_download('en')
    spacy_download('en_core_web_lg')
    nlp = spacy.load('en_core_web_lg')
    

# Called when a request is received
def run(raw_data):
    # Get the input data as a numpy array
    data = (json.loads(raw_data)['data'])
    

    with open(model_path, 'rb') as f:
        spacy_questions = pickle.load(f)
    
    q_new = [nlp(data).vector]
    result = cosine_similarity(q_new,spacy_questions)
    predictions = result
    
    return predictions.tolist()

Overwriting ./recocode/score.py


#  Create the Dependencies for the Inference Script

In [81]:
from azureml.core.conda_dependencies import CondaDependencies

# Add the dependencies for your model
myenv = CondaDependencies()
myenv.add_conda_package("scikit-learn")
myenv.add_conda_package("spacy")

# Save the environment config as a .yml file
env_file = './recocode/env.yml'
with open(env_file,"w") as f:
    f.write(myenv.serialize_to_string())
print("Saved dependency info in", env_file)

Saved dependency info in ./recocode/env.yml


# Create an Inference Configuration

In [82]:
from azureml.core.model import InferenceConfig

classifier_inference_config = InferenceConfig(runtime= "python",
                                              source_directory = './recocode',
                                              entry_script="score.py",
                                              conda_file="env.yml")

# Create the Inference Cluster

In [40]:
from azureml.core.compute import ComputeTarget, AksCompute

cluster_name = 'aks-cluster'
compute_config = AksCompute.provisioning_configuration(cluster_purpose = AksCompute.ClusterPurpose.DEV_TEST)
production_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
production_cluster.wait_for_completion(show_output=True)

Creating......................................................
SucceededProvisioning operation finished, operation "Succeeded"


In [42]:
from azureml.core.webservice import AksWebservice

classifier_deploy_config = AksWebservice.deploy_configuration(cpu_cores = 1,
                                                              memory_gb = 1)

# Deploy the Model to the Inference Cluster

In [83]:
from azureml.core.model import Model

model1 = ws.models['spacy_questions_model']
service = Model.deploy(workspace=ws,
                       name = 'reco-service3',
                       models = [model1],
                       inference_config = classifier_inference_config,
                       deployment_config = classifier_deploy_config,
                       deployment_target = production_cluster)
service.wait_for_deployment(show_output = True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-05-13 00:59:06+05:30 Creating Container Registry if not exists.
2021-05-13 00:59:06+05:30 Registering the environment.
2021-05-13 00:59:08+05:30 Use the existing image.
2021-05-13 00:59:10+05:30 Creating resources in AKS.
2021-05-13 00:59:10+05:30 Submitting deployment to compute.
2021-05-13 00:59:12+05:30 Checking the status of deployment reco-service3..
2021-05-13 01:01:12+05:30 Checking the status of inference endpoint reco-service3.
Succeeded
AKS service creation operation finished, operation "Succeeded"


In [84]:
service.update(enable_app_insights=True)

# Perform predictions with the Inference Cluster

In [85]:
endpoint = service.scoring_uri
print(endpoint)

http://20.80.120.179:80/api/v1/service/reco-service3/score


In [86]:
primary_key, secondary_key = service.get_keys()

In [87]:
primary_key

'Ai4ob6rfQM4YCgZKzl4DL80EVcfYsL9J'

In [95]:
import requests
import json

# An array of new data cases
x_new = "I want to be a data scientist. What should I study"

# Convert the array to a serializable list in a JSON document
json_data = json.dumps({"data": x_new})

# Set the content type in the request headers
request_headers = { "Content-Type":"application/json",
                    "Authorization":"Bearer " + primary_key }

# Call the service
response = requests.post(url = endpoint,
                         data = json_data,
                         headers = request_headers)

print(response)

<Response [200]>


In [96]:
result = response.json()

In [97]:
import numpy as np
import pandas as pd

data_folder = "recodata"

questions  = pd.read_csv(os.path.join(data_folder, 'questions.csv'))
professionals = pd.read_csv(os.path.join(data_folder, 'professionals.csv'))
answers = pd.read_csv(os.path.join(data_folder, 'answers.csv'))

prof_ans = pd.merge(professionals, answers, how = 'left' ,
                    left_on = 'professionals_id', right_on = 'answers_author_id')
prof_ans_q = pd.merge(prof_ans, questions, how = 'left' ,
                      left_on = 'answers_question_id', right_on = 'questions_id')

prof_ans_q = prof_ans_q[(~prof_ans_q["questions_title"].isna()) | (~prof_ans_q["questions_body"].isna()) ]

index = np.argsort(result)[:,-10:]
print(index[0])

[24223 30993 16339 30968 47722 33136 16221 18047 48865 49181]


# Best answers

In [98]:
prof_ans_q.iloc[index[0][-1]]["answers_body"]

'<p>You should search for Algorithm videos. Usually when studying data, you would need to know about databases structure, analytics skills, and some other logics. Another thing you could do would be start analyzing some small real cases like how long does it take to go from your house to the supermarket and what you could do to reduce the time? or how often do you drink water (time gap between each occurence). How could you track that? and how could you improve it? is it good?</p><p>these are a few examples on how you could analyze stuff.</p>'

In [99]:
prof_ans_q.iloc[index[0][-2]]["answers_body"]

'<p> </p><p>Hello Chong G.</p><p> </p><p>I am not a data scientist, but I think I can give you some advice on this. Nowadays, an increasing number of professions are requiring analytics capabilities.</p><p> </p><p> </p><p>There are some core things you should learn to handle great amount of data, like:</p><p>&nbsp;</p><p>Relational Database concepts;</p><p>SQL - Computer language for creating and managing databases;</p><p>Excel;</p><p>Programing languages such as C, VBA, R...&nbsp;</p><p><br></p><p>You should also consider learning how to display the data in an organized way and Power BI / Think-Cell are great for that</p><p>  </p><p>There are several tutorials around the internet about those topics and also focused courses. I personally recommend the latter, because it is easier to progress through the topics.</p><p><br></p><p>Hope my advice was helpful to you!</p><p> </p>'

In [100]:
prof_ans_q.iloc[index[0][-3]]["answers_body"]

'<p>It is wonderful that you want to learn software coding and become a software engineer. This is  certainly a field that has and will continue to have lots of exciting prospects for learning and growth. One of the ways to become a software engineer is to get a degree in Computer Science or Information technology or computer applications. For applying to a college to study these disciplines one must first of all be interested in and study Maths as well as Physics and Chemistry (so called PCM subjects) till 12th standard. Meanwhile, in many schools, computers is taught as a subject in 9th, 10th, 11th and 12th standards. As part of the computer subject you will learn about programming and learn the basics of coding in a language such as Java. If your school/PUC college does not offer computers as a subject then you can enroll for a programming class outside school or if you have access to a computer you can learn programming on your own. Based on the above if you can give me more detail