In [4]:
import argparse
import json
import math
import os
import random
import sys
import traceback
from glob import glob

import numpy as np
import tensorflow as tf
import tensorflow.keras
from azureml.core import Dataset, Datastore, Experiment, Run, Workspace
from azureml.core import Environment
from azureml.core import Run
from azureml.core.authentication import AzureCliAuthentication
from azureml.core.conda_dependencies import CondaDependencies
from dotenv import load_dotenv
from numpy.random import seed
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Flatten, Input, concatenate, Dense, Activation, Dropout, BatchNormalization,  MaxPooling2D, AveragePooling2D, Conv2D
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.random import set_seed
from dotnetcore2 import runtime
runtime.version = ("18", "10", "0")

2021-11-29 19:52:52.239888: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-29 19:52:52.240047: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [5]:
    # Load the Azure ML workspace described by the local config file.
    ws = Workspace.from_config(path='config.json')

In [12]:

# Handle to the "imdb_train" experiment inside the connected workspace.
exp = Experiment(ws, "imdb_train")

# Folder next to the notebook's working directory; created if it is missing.
script_folder = os.path.join(
    os.getcwd(),
    "imdb_training",
)
os.makedirs(script_folder, exist_ok=True)

In [9]:
# Declare the training parameters. Numeric flags use type=int so argparse
# rejects non-numeric input up front instead of failing in a later int() call.
parser = argparse.ArgumentParser()
parser.add_argument('--data-folder', type=str, dest='data_folder', help='Test data folder mounting point')
parser.add_argument('--epochs', type=int, dest='epochs', help='Amount of epochs to train')
parser.add_argument('--batch_size', type=int, dest='batch_size', help='Batch size')
parser.add_argument('--model_name', type=str, dest='model_name', help='Model name')

# The argument list is supplied explicitly because a notebook kernel has no
# meaningful command line of its own to parse.
args = parser.parse_args([
    '--data-folder', 'data/tmp/train_test_data',
    '--epochs', '20',
    '--batch_size', '5',
    '--model_name', 'imdb-trained',
])

data_folder = args.data_folder
print('Data folder:', data_folder)

# Train and test arrays are stored as .npy files inside the data folder.
dataset_train = np.load(os.path.join(data_folder, 'dataset_train.npy'))
dataset_test = np.load(os.path.join(data_folder, 'dataset_test.npy'))

run = Run.get_context()

# Already integers thanks to type=int above.
batch_size = args.batch_size
epochs = args.epochs

# Last expression of the cell: show the training array.
dataset_train

Data folder: data/tmp/train_test_data


array([[5, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [3, 0, 0, ..., 0, 0, 0]])

In [None]:
# Building the autoencoder
from tensorflow.keras import metrics

# Layer widths of the symmetric autoencoder.
INPUT_DIM = 3952       # width of the input vector and of the reconstruction
HIDDEN_DIM = 1000      # width of the layers on either side of the bottleneck
BOTTLENECK_DIM = 120   # width of the compressed representation

autoencoder = Sequential()
# Encoder: narrows the input towards the bottleneck.
autoencoder.add(InputLayer((INPUT_DIM,)))
autoencoder.add(Dense(HIDDEN_DIM, activation='relu'))
# Bottleneck
autoencoder.add(Dense(BOTTLENECK_DIM, activation='relu'))
# Decoder: widens the bottleneck back to the input width.
autoencoder.add(Dense(HIDDEN_DIM, activation='relu'))

# NOTE(review): a sigmoid output is bounded to (0, 1). If the training data
# holds unscaled values greater than 1, the reconstruction can never match
# them — confirm the data is scaled or revisit this activation.
autoencoder.add(Dense(INPUT_DIM, activation='sigmoid'))
autoencoder.summary()




In [None]:
# Training the autoencoder with a custom loss function

def custom_loss(y_true,y_pred):
    """Squared reconstruction error that ignores zero entries of ``y_true``.

    A mask is derived from ``y_true`` by clipping it to ``[0, 0.01]`` and
    multiplying by 100: entries equal to 0 get weight 0, entries >= 0.01 get
    weight 1. The masked error is squared and averaged over the last axis;
    the average divides by the full length of that axis, not by the number
    of unmasked entries.

    Args:
        y_true: Target tensor.
        y_pred: Predicted tensor, same shape as ``y_true``.

    Returns:
        Tensor of per-sample losses (last axis reduced).
    """
    # Use the backend through the `tensorflow` name bound by
    # `import tensorflow.keras`; no `tf` or `K` alias is needed.
    backend = tensorflow.keras.backend
    y_mask = backend.clip(y_true, 0, 0.01) * 100
    return backend.mean(backend.square(y_mask * (y_pred - y_true)), axis=-1)

# Configure the optimiser and the masked loss before training.
autoencoder.compile(optimizer='adam', loss=custom_loss)

# Both callbacks monitor the validation loss with a patience of 5 epochs.
# NOTE(review): with identical patience the learning-rate reduction may fire
# on the same epoch that early stopping ends training — confirm this is intended.
reduce_lr = tensorflow.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    verbose=1,
)
early_stopping_callback = tensorflow.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
)

# The training data is passed as both input and target; 20% of it is held
# out for validation.
autoencoder.fit(
    x=np.array(dataset_train),
    y=np.array(dataset_train),
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
    shuffle=True,
    callbacks=[reduce_lr, early_stopping_callback],
)

In [None]:
# Collect, per test row, the non-zero actual values and the predictions at
# the same positions. Predictions come from dataset_test, so the actual
# values must be taken from dataset_test too, and rows are paired by index.
test_not_null = []
pred_not_null = []
predictions = autoencoder.predict(dataset_test)

# np.asarray keeps this working for a NumPy array (what np.load returns) and
# for other array-likes; plain NumPy indexing replaces the DataFrame-only .iloc.
test_matrix = np.asarray(dataset_test)
for actual_row, predicted_row in zip(test_matrix, predictions):
    # Column positions where this row has a non-zero entry.
    nonzero_columns = np.nonzero(actual_row)[0]
    test_not_null.append(np.asarray(actual_row)[nonzero_columns].astype(float))
    pred_not_null.append(np.asarray(predicted_row)[nonzero_columns].astype(float))



In [None]:
THRESHOLD = 1
from sklearn.metrics import mean_squared_error
import math

# Per-row statistics over the non-zero entries gathered in test_not_null /
# pred_not_null. Rows are paired with zip instead of rebuilding a (ragged)
# array of all predictions on every iteration.
means = []       # mean squared error of each row
variances = []   # population variance of each row's actual values
for actual_values, predicted_values in zip(test_not_null, pred_not_null):
    actual_values = np.asarray(actual_values, dtype=float)
    predicted_values = np.asarray(predicted_values, dtype=float)
    if actual_values.size == 0:
        # A row without non-zero entries has no error or variance to report;
        # skipping it avoids a division by zero.
        continue
    means.append(np.mean(np.power(actual_values - predicted_values, 2)))
    # np.var divides by n, matching sum((x - mean) ** 2) / n.
    variances.append(np.var(actual_values))

mse = np.mean(means)
print("MSE")
print(mse)
print("STD")
# Square root of the average per-row variance.
std = math.sqrt(np.mean(variances))
print(std)
# A very low std means the non-zero review scores within each row lie close together.


In [30]:


# Environment that lists the pip packages required by the training run.
env = Environment(name='imdb-training-env')
cd = CondaDependencies.create(
    pip_packages=[
        'azureml-dataset-runtime[pandas,fuse]',
        'azureml-defaults',
        'tensorflow',
        'scikit-learn',
        'pandas',
    ],
)
env.python.conda_dependencies = cd

# Register the environment in the workspace so it can be re-used later.
env.register(workspace=ws)

{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20211029.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "imdb-training-env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-for

In [31]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Cluster name and scale limits, overridable through environment variables.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "imdb-cluster")
# os.environ values are strings, so cast to int to keep the node counts
# numeric whether they come from the environment or from the defaults.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4))

# The default SKU is STANDARD_NC6 (a GPU VM size). Set AML_COMPUTE_CLUSTER_SKU
# to choose a different size, e.g. a CPU one.
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_NC6")


if compute_name in ws.compute_targets:
    # Re-use the existing target instead of provisioning a new one.
    compute_target = ws.compute_targets[compute_name]
    if compute_target and isinstance(compute_target, AmlCompute):
        print("found compute target: " + compute_name)
else:
    print("creating new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes,
                                                                identity_type="SystemAssigned")

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# For a more detailed view of current AmlCompute status, use get_status()
print(compute_target.get_status().serialize())

found compute target: imdb-cluster
{'currentNodeCount': 1, 'targetNodeCount': 1, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 1, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-11-29T18:58:06.140000+00:00', 'errors': None, 'creationTime': '2021-11-29T15:25:24.425870+00:00', 'modifiedTime': '2021-11-29T16:24:49.261775+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT1800S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


In [54]:
from azureml.core import ScriptRunConfig

model_name = "imdb_model"

# Look up the registered dataset that is handed to the training script.
train_test_dataset = Dataset.get_by_name(ws, name='imdb_train_test')

# Arguments for train.py, one flag/value pair per line; the dataset is
# passed as a mount.
args = [
    '--data-folder', train_test_dataset.as_mount(),
    '--epochs', epochs,
    '--batch_size', batch_size,
    '--model_name', model_name,
]

# Bundle script, arguments, compute target and environment into one run config.
src = ScriptRunConfig(
    source_directory="../steps/root/scripts",
    script='train.py',
    arguments=args,
    compute_target=compute_target,
    environment=env,
)

In [55]:
# Submit the run configuration to the experiment; the bare `run` below makes
# the notebook display the returned run object.
run = exp.submit(src)
run

Experiment,Id,Type,Status,Details Page,Docs Page
imdb_train,imdb_train_1638215255_2591be2b,azureml.scriptrun,Preparing,Link to Azure Machine Learning studio,Link to Documentation


In [59]:
import json

# Block until the submitted run has finished.
run.wait_for_completion()

# Copy the run details, leaving out the two dataset entries.
run_details = {}
for detail_key, detail_value in run.get_details().items():
    if detail_key in ['inputDatasets', 'outputDatasets']:
        continue
    run_details[detail_key] = detail_value


FileNotFoundError: [Errno 2] No such file or directory: 'states/training-run.json'

In [61]:
filename = "states/training-run.json"

# Make sure the target directory exists. exist_ok=True makes this safe when
# the directory is already there, so no separate existence check is needed.
os.makedirs(os.path.dirname(filename), exist_ok=True)

# Open the path stored in `filename`, not a file literally named 'filename'.
with open(filename, 'w') as training_run_json:
    json.dump(run_details, training_run_json)

In [62]:
# Register the model produced by the run. The name comes from the
# `model_name` variable so it always agrees with the outputs/ path below.
model = run.register_model(model_name=model_name, model_path=f'outputs/{model_name}')
print(model.name, model.id, model.version, sep='\t')

# Store the serialized model together with the run details.
model_json = {}
model_json["model"] = model.serialize()
model_json["run"] = run_details

# Ensure the output directory exists so this cell does not depend on an
# earlier cell having created it.
os.makedirs('states', exist_ok=True)
with open('states/model_details.json', 'w') as model_details:
    json.dump(model_json, model_details)

imdb_model	imdb_model:1	1
