In [68]:
import azureml.core
from azureml.core import Workspace

# Connect to the existing Azure ML workspace, identified by its name,
# subscription and resource group.
ws = Workspace.get(
    name="MachineLearning",
    subscription_id="41e7097b-b33b-4dad-ba38-6a953141427e",
    resource_group="MachineLearning",
)

# Report the SDK version in use and the workspace that was resolved.
print("Ready to use Azure ML {} to work with {}".format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.0.72 to work with MachineLearning


In [69]:
import getpass
import os

from azureml.core import Datastore

# Name under which the blob container is registered in the workspace.
blob_datastore_name = 'mldatastore'

if blob_datastore_name not in ws.datastores:
    container_name = os.getenv("BLOB_CONTAINER", "mldata")  # Name of Azure blob container
    account_name = os.getenv("BLOB_ACCOUNTNAME", "alexmlstorage")  # Storage account name
    # The storage account key is a secret and must never be committed with the
    # notebook: read it from the environment, or prompt for it interactively.
    # NOTE(review): any key that was previously hard-coded here should be rotated.
    account_key = os.getenv("BLOB_ACCOUNT_KEY") or getpass.getpass(
        "Storage account key (BLOB_ACCOUNT_KEY): ")

    try:
        blob_ds = Datastore.register_azure_blob_container(workspace=ws,
                                                          datastore_name=blob_datastore_name,
                                                          container_name=container_name,
                                                          account_name=account_name,
                                                          account_key=account_key)
    except Exception as ex:
        # Surface the failure instead of swallowing it: later cells need
        # `blob_ds` and would otherwise fail with an unrelated NameError.
        print(ex)
        raise
    print('Datastore registered.')
else:
    # The datastore already exists in the workspace; just fetch a handle to it.
    blob_ds = Datastore.get(ws, datastore_name=blob_datastore_name)
    print('Datastore already registered.')

Datastore already registered.


In [70]:
from azureml.core import Dataset

if 'document_files' in ws.datasets:
    # A dataset with this name is already registered: reuse it.
    document_ds = Dataset.get_by_name(ws, 'document_files')
    print('Dataset already registered.')
else:
    # Create a file dataset from the datastore, using an empty relative path.
    document_ds = Dataset.File.from_files(path=(blob_ds, ''))

    # Register the file dataset; on failure the error is printed and
    # `document_ds` keeps referring to the unregistered dataset.
    try:
        document_ds = document_ds.register(
            workspace=ws,
            name='document_files',
            description='documents',
            create_new_version=True,
        )
        print('Dataset registered.')
    except Exception as ex:
        print(ex)

Dataset already registered.


In [9]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = 'aml-cluster'

try:
    # Look up an existing compute target with this name.
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The lookup failed, so provision a new cluster under that name.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        max_nodes=4,
    )
    training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster reports its provisioning state, streaming progress.
training_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


In [75]:
# Local folder for the training experiment files: train.py is written into it
# and it is later passed to the estimator as its source directory.
experiment_folder = 'experiment'
# exist_ok=True makes this cell safe to re-run when the folder already exists.
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder, 'folder created')

experiment folder created


In [79]:
%%writefile $experiment_folder/train.py

import argparse
import numpy as np
import pandas as pd
import re
import pickle
import tika
import glob
from keras.layers import  Dropout, Dense
from keras.models import Sequential
from keras.utils import np_utils
from keras.models import load_model
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tika import parser as tika_parser
from azureml.core import Run
from azureml.core import Experiment
from azureml.core import Model

# Parse the single command-line argument: the folder where the input dataset
# is made available to this script.
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='data folder mounting point')
args = parser.parse_args()

data_folder = args.data_folder
print('Data folder:', data_folder)

# Handle to the current Azure ML run; used below for uploads, metric logging
# and model registration.
run = Run.get_context()

# Collect the file paths for each document class from its own sub-folder.
# NOTE(review): the pattern is appended to data_folder without a path
# separator, and glob.glob is called without recursive=True, in which case
# "**" matches like a single "*" rather than descending recursively. Confirm
# this matches the real layout of the mounted dataset.
print('Loading Data...')
agenda_paths = glob.glob(data_folder + "**/Agendas/*")
medicalrecord_paths = glob.glob(data_folder + "**/MedicalRecords/*")
paper_paths = glob.glob(data_folder + "**/Papers/*")
resume_paths = glob.glob(data_folder + "**/Resumes/*")

def preprocess_text(text):
    """Normalise raw document text before vectorisation.

    Applies three regex substitutions in order: every non-letter character
    becomes a space, single letters surrounded by whitespace are removed, and
    runs of whitespace are collapsed to one space.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text (leading/trailing spaces are not stripped).
    """
    substitutions = (
        '[^a-zA-Z]',           # anything that is not an ASCII letter
        r"\s+[a-zA-Z]\s+",     # an isolated single letter
        r'\s+',                # any run of whitespace
    )
    for pattern in substitutions:
        text = re.sub(pattern, ' ', text)
    return text

def process_raw_data(paths, label):
    """Extract and clean the text of each file and label it.

    Each path is parsed with Tika and its text is passed through
    ``preprocess_text``. Files for which Tika returns no text (its
    ``"content"`` entry is missing, ``None`` or empty) are skipped with a
    message instead of crashing the regex preprocessing with a TypeError.

    Args:
        paths: Iterable of file paths to parse.
        label: The class label assigned to every extracted document.

    Returns:
        A DataFrame with columns ``Content`` (cleaned text) and ``Type``
        (the label), one row per file that yielded text.
    """
    contents = []

    for path in paths:
        print('Processing {}'.format(path))
        parsed = tika_parser.from_file(path)
        raw_text = parsed.get("content") if parsed else None
        if not raw_text:
            # Nothing to learn from a document without extractable text.
            print('Skipping {}: no text content extracted'.format(path))
            continue
        contents.append(preprocess_text(raw_text))

    data = {
        'Content': contents,
        # Labels are sized from the kept documents so both columns stay aligned.
        'Type': [label] * len(contents)
    }
    return pd.DataFrame(data, columns = ['Content', 'Type'])

def tfidf(X_train, X_test, num_words=5000):
    """Vectorise the train/test texts with TF-IDF and persist the vectoriser.

    The vectoriser is fitted on the training texts only and then applied to
    the test texts. It is pickled to ``vectoriser.pkl`` and uploaded to the
    current run under ``models/``.

    Args:
        X_train: Training documents (iterable of strings).
        X_test: Test documents (iterable of strings).
        num_words: Maximum number of features kept by the vectoriser.

    Returns:
        A tuple ``(X_train, X_test)`` of dense TF-IDF arrays.
    """
    vectorizer_x = TfidfVectorizer(max_features=num_words)
    X_train = vectorizer_x.fit_transform(X_train).toarray()
    X_test = vectorizer_x.transform(X_test).toarray()

    vectoriser_file = 'vectoriser.pkl'
    # Close the file before uploading so the pickle is fully flushed to disk.
    with open(vectoriser_file, 'wb') as handle:
        pickle.dump(vectorizer_x, handle)
    run.upload_file(name = 'models/' + vectoriser_file, path_or_stream = './' + vectoriser_file)

    print("tf-idf with", str(X_train.shape[1]), "features")

    return (X_train,X_test)

def build_DNN_model(shape, num_classes, dropout=0.2):
    """Build and compile a fully connected softmax classifier.

    The network is an input Dense layer of 512 ReLU units followed by four
    more 512-unit ReLU Dense layers, each followed by Dropout, and a softmax
    output layer. It is compiled with categorical cross-entropy, the Adam
    optimizer and the accuracy metric.

    Args:
        shape: Number of input features.
        num_classes: Number of output classes.
        dropout: Dropout rate applied after every Dense hidden layer.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    units = 512          # width of every hidden layer
    extra_hidden = 4     # hidden layers added after the input layer

    model = Sequential()

    # Input layer plus its dropout.
    model.add(Dense(units, input_dim=shape, activation='relu'))
    model.add(Dropout(dropout))

    # Remaining hidden layers, each paired with dropout.
    for _ in range(extra_hidden):
        model.add(Dense(units, input_dim=units, activation='relu'))
        model.add(Dropout(dropout))

    # Output layer: one probability per class.
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

# Extract and label the text of each document class, then combine them.
agenda_df = process_raw_data(agenda_paths, 'agenda')
medicalrecord_df = process_raw_data(medicalrecord_paths, 'medicalrecord')
paper_df = process_raw_data(paper_paths, 'paper')
resume_df = process_raw_data(resume_paths, 'resume')

df = pd.concat([agenda_df, medicalrecord_df, paper_df, resume_df], axis=0)

X = df['Content'].values
y = df['Type'].values

# Encode the string labels as integers, then one-hot encode them for the
# softmax output layer.
encoder = LabelEncoder()
encoded_y = encoder.fit_transform(y)
dummy_y = np_utils.to_categorical(encoded_y)

# Persist the encoder so the scoring service can map predictions back to labels.
encoder_file = 'encoder.pkl'
# Close the file before uploading so the pickle is fully flushed to disk.
with open(encoder_file, 'wb') as handle:
    pickle.dump(encoder, handle)
run.upload_file(name = 'models/' + encoder_file, path_or_stream = './' + encoder_file)

X_train, X_test, y_train, y_test = train_test_split(X, dummy_y, test_size=0.20, random_state=7)
X_train_tfidf, X_test_tfidf = tfidf(X_train, X_test)

# Size the output layer from the one-hot labels instead of hard-coding 4, so
# the model still matches the targets if a class folder yields no documents.
model = build_DNN_model(X_train_tfidf.shape[1], dummy_y.shape[1])

# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

history = model.fit(X_train_tfidf, y_train, validation_split=0.2, epochs=5, batch_size=128, verbose=1)
score = model.evaluate(X_test_tfidf, y_test, verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

# Built-in float replaces the np.float alias, which was removed in NumPy 1.24.
run.log('Score', float(score[0]))
run.log('Accuracy', float(score[1]))

dnn_file = 'dnn.h5'
model.save(dnn_file)
run.upload_file(name = 'models/' + dnn_file, path_or_stream = './' + dnn_file)

run.complete()

# Register everything uploaded under models/ (network, vectoriser, encoder)
# as a single model, tagged with the logged accuracy.
run.register_model(model_path='./models',
                   model_name='document_classification_model',
                   description='Document classification model',
                   properties={'Accuracy': run.get_metrics()['Accuracy']})

print('Model trained and registered.')

Overwriting experiment/train.py


In [80]:
import getpass
import os

from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

env = Environment('document-classification')

# Run inside Docker, starting from a custom base image in a private registry.
env.docker.enabled = True
env.docker.base_image = "intelmpi:v1"
env.docker.base_image_registry.address = "machinelearndcc81c93.azurecr.io"
env.docker.base_image_registry.username = "machinelearndcc81c93"
# The registry password is a secret and must never be committed with the
# notebook: read it from the environment, or prompt for it interactively.
# NOTE(review): any password that was previously hard-coded here should be rotated.
env.docker.base_image_registry.password = os.getenv("ACR_PASSWORD") or getpass.getpass(
    "Container registry password (ACR_PASSWORD): ")

# to install required packages
conda_packages = ['numpy', 'scikit-learn']
pip_packages = ['azureml-dataprep[pandas,fuse]', 'azureml-defaults', 'tensorflow', 'keras', 'tika']

cd = CondaDependencies.create(pip_packages=pip_packages, conda_packages=conda_packages)
env.python.conda_dependencies = cd

# Register environment to re-use later
env.register(workspace = ws)
print('Environment registered.')

Environment registered.


In [81]:
from azureml.train.estimator import Estimator
from azureml.core import Experiment
from azureml.widgets import RunDetails

# Set the script parameters: pass the file dataset to train.py as a mounted
# input named 'documents'.
script_params = {
    '--data-folder': document_ds.as_named_input('documents').as_mount()
}

# (The unused `diabetes_ds` lookup that used to sit here was a copy-paste
# leftover; the dataset is already supplied through `script_params`.)

# Create an estimator
estimator = Estimator(source_directory=experiment_folder,
                       entry_script='train.py',
                       script_params=script_params,
                       compute_target=cluster_name,
                       environment_definition=env
                      )

# Create an experiment
experiment_name = 'documentclassification-training'
experiment = Experiment(workspace = ws, name = experiment_name)

# Run the experiment
run = experiment.submit(config=estimator)

# Show the run details while running
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'documentclassification-training_1586586326_a07bcf03',
 'target': 'aml-cluster',
 'status': 'Finalizing',
 'startTimeUtc': '2020-04-11T06:29:48.994886Z',
 'properties': {'_azureml.ComputeTargetType': 'amlcompute',
  'ContentSnapshotId': '1b8ba5d4-b1c0-49eb-8ff0-832bccc973ce',
  'AzureML.DerivedImageName': 'azureml/azureml_13204f7f39d612b4bfd4cf1d9fa89800',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [{'dataset': {'id': '0d2fdb57-6235-4f6a-9c07-32593b9df03d'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'documents', 'mechanism': 'Mount'}}],
 'runDefinition': {'script': 'train.py',
  'useAbsolutePath': False,
  'arguments': ['--data-folder', 'DatasetConsumptionConfig:documents'],
  'sourceDirectoryDataStore': None,
  'framework': 'Python',
  'communicator': 'None',
  'target': 'aml-cluster',
  'dataReferences': {},
  'data': {'documents': {'dataLocation': {'dataset': {'id': '0d2

In [85]:
import os

service_folder = 'service'

# Create a folder for the web service files.
# Only `service_folder` is set here. `experiment_folder` is deliberately left
# alone so that re-running the training cells still targets the training
# folder rather than the service folder.
os.makedirs(service_folder, exist_ok=True)

print(service_folder, 'folder created.')

service folder created.


In [112]:
%%writefile $service_folder/score.py
import json
import pickle
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.models import load_model
from azureml.core.model import Model

# Called when the service is loaded
def init():
    """Load the classifier, vectoriser and label encoder into module globals.

    Resolves the folder of the registered 'document_classification_model'
    and loads ``dnn.h5`` with Keras plus the pickled ``vectoriser.pkl`` and
    ``encoder.pkl`` from it. The results are stored in the globals ``model``,
    ``vectoriser`` and ``encoder``, which ``run`` reads.
    """
    global model, vectoriser, encoder

    def _unpickle(path):
        # Load one pickled artefact, closing the file afterwards.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    # Get the path where the deployed model can be found.
    model_path = Model.get_model_path('document_classification_model')

    # Load the three artefacts in the same order as before.
    model = load_model(model_path + '/dnn.h5')
    vectoriser = _unpickle(model_path + '/vectoriser.pkl')
    encoder = _unpickle(model_path + '/encoder.pkl')

# Called when a request is received
def run(data):
    """Classify the text contained in a JSON request.

    Expects a request body of the form ``{"text": "some text to classify"}``.
    The text is cleaned, vectorised and scored, and the class with the
    highest predicted probability is returned.

    The previous implementation rounded the probabilities and looked for an
    entry equal to 1, which produced no label at all whenever no class
    reached 0.5; taking the arg-max always yields exactly one label.

    Args:
        data: The raw JSON request body as a string.

    Returns:
        The predicted label as a ``str``. If anything fails, the exception
        message is returned as a string instead (unchanged contract).
    """
    try:
        payload = json.loads(data)
        text = preprocess_text(payload['text'])
        vector = vectoriser.transform([text]).toarray()
        # One row in, one row of class probabilities out.
        probabilities = model.predict(vector)[0]
        class_index = int(np.argmax(probabilities))
        label = encoder.inverse_transform([class_index])[0]
        # Convert the NumPy string to a plain str so it serialises cleanly.
        return str(label)
    except Exception as e:
        error = str(e)
        return error

def preprocess_text(text):
    """Clean request text with the same three regex steps used in training.

    In order: every non-letter character becomes a space, single letters
    surrounded by whitespace are removed, and runs of whitespace are
    collapsed to one space.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text (leading/trailing spaces are not stripped).
    """
    substitutions = (
        '[^a-zA-Z]',           # anything that is not an ASCII letter
        r"\s+[a-zA-Z]\s+",     # an isolated single letter
        r'\s+',                # any run of whitespace
    )
    for pattern in substitutions:
        text = re.sub(pattern, ' ', text)
    return text

Overwriting service/score.py


In [113]:
from azureml.core.environment import Environment
from azureml.core.conda_dependencies import CondaDependencies

# Environment used by the scoring web service.
deployment_env = Environment('document-classification-deployment')

# Packages the scoring script needs at inference time.
conda_packages = ['numpy', 'scikit-learn']
pip_packages = [
    'azureml-dataprep[pandas,fuse]',
    'azureml-defaults',
    'tensorflow',
    'keras',
]

conda_dep = CondaDependencies.create(
    conda_packages=conda_packages,
    pip_packages=pip_packages,
)

# Attach the dependencies to the environment's Python section.
deployment_env.python.conda_dependencies = conda_dep

In [115]:
from azureml.core import Model
from azureml.core.webservice import AciWebservice
from azureml.core.model import InferenceConfig

# Look up the registered model by name in the workspace.
model = ws.models['document_classification_model']

# How to run the model: the scoring script plus the environment it needs.
inference_config = InferenceConfig(
    source_directory=service_folder,
    entry_script='score.py',
    environment=deployment_env,
)

# Where to run it: an Azure Container Instance with 1 core and 1 GB of memory.
deployment_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1)

service_name = "document-classification-service"

service = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config,
)

# Wait for the deployment to finish, streaming progress, then report its state.
service.wait_for_deployment(show_output=True)
print(service.state)

Running....................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [118]:
import requests
import json

# Take the endpoint from the deployed service rather than hard-coding it: the
# ACI address changes whenever the service is redeployed, which would leave a
# pasted URL stale.
scoring_uri = service.scoring_uri
# The scoring script parses the request body as JSON.
headers = {'Content-Type':'application/json'}

test_data = json.dumps({'text': 'AMY PROFILE Fund accountant with nearly 2 years of experience in hedge fund administration, which includes preparation of NAV calculations, financial statements and associated reports. Consistently meeting deadlines while ensuring a high quality of work standards. Fast learner, driven for results and analytical in problem solving. WORK EXPERIENCE Citco Fund Services (Singapore) Pte Ltd  Jan 2016 – Present Fund Accountant Calculation of estimate and final NAVs on a daily, weekly and monthly basis Preparation of cash and position reconciliation reports  Daily pre-production tasks such as price checks, interest accruals and fees booking  Maintaining day-to-day relationships with investment managers, brokers and auditors  Communicating with the reconciliations team to ensure consistent and high-quality standards when delivering NAV packages  Undertaking fund migrations from Citco Toronto and Citco Dublin Office  Familiar with various pricing valuation models e.g. 
Independent Price Verification, External Valuer  Investigation and resolution of breaks  Training of new fund accountants in the team  Assist in reviewing estimate and final NAV packs Norgas Carriers Private Limited Nov 2014 – Apr 2015 Accountant  Processing invoices from suppliers and scheduling payment remittances  Assisted in collecting outstanding debts from debtors  Involved in the year end closing of accounts  Liaised with external auditors during interim and final audit  Monthly bank reconciliation  Perform intercompany journal entries and reconciliation Singapore Armed Forces Jul 2013 – July 2014 Operation Specialist/ NSF Battery Sergeant Major (2nd Sergeant, 24th Battalion Singapore Artillery)  In charge of the training management and administration matters, ensuring smooth running and operation of the battalion  Responsible for the welfare, discipline and regimentation of 60 fellow NSFs  Rallied with superiors to keep a lookout for troubled soldiers, acted as the first level of feedback for peers EDUCATION AND QUALIFICATIONS Royal Melbourne Institute of Technology (RMIT) Jul 2014 – Oct 2015 Bachelor of Business (Accountancy) GPA: 3.1 / 4 Temasek Polytechnic Apr 2009 – Apr 2012 Diploma in Accounting and Finance Ngee Ann Secondary School Jan 2005 – Nov 2008 GCE ‘O’ Levels CertificateSKILLS AND COMPETENCIES  Trained in accounting softwares, Aexeo, Agresso Business World, Sage AccPac and MYOB  Proficient in Microsoft Excel, PowerPoint and Word  Competent in Financial and Economic Databases (Bloomberg, Thomson Reuters)  Languages spoken: English, Chinese, Cantonese  Languages written: English, Chinese'})

# POST the JSON payload to the scoring endpoint.
response = requests.post(scoring_uri, data=test_data, headers=headers)

# Show the HTTP status, the request round-trip time, and the decoded JSON
# body returned by the service.
print(response.status_code)
print(response.elapsed)
print(response.json())

200
0:00:00.097396
resume
