# Create a workspace

In [20]:
import azureml.core

print(azureml.core.VERSION)

from azureml.core import Workspace

import os

# Create the Azure ML workspace (and its resource group) used by the rest of
# this notebook.
# - The subscription can be overridden with the AZURE_SUBSCRIPTION_ID
#   environment variable so the id does not have to be edited in the
#   notebook; the original value is kept as the fallback.
# - exist_ok=True makes the cell safe to re-run: when a workspace with this
#   name already exists it is returned instead of raising an error.
ws = Workspace.create(
    name='GoGreenworkspace',
    subscription_id=os.environ.get(
        'AZURE_SUBSCRIPTION_ID', '2bf014a6-b217-4034-a315-cb95042c9087'),
    resource_group='rgGoGreenworkspace',
    create_resource_group=True,
    location='North Europe',
    exist_ok=True,
)

print('AMLS Workspace created')

1.0.72
Deploying KeyVault with name gogreenwkeyvault6cf17edb.
Deploying StorageAccount with name gogreenwstorage8897f03af.
Deploying AppInsights with name gogreenwinsights0b3ecb09.
Deployed AppInsights with name gogreenwinsights0b3ecb09. Took 9.45 seconds.
Deployed KeyVault with name gogreenwkeyvault6cf17edb. Took 21.4 seconds.
Deployed StorageAccount with name gogreenwstorage8897f03af. Took 27.56 seconds.
Deploying Workspace with name GoGreenworkspace.
Deployed Workspace with name GoGreenworkspace. Took 135.54 seconds.
AMLS Workspace created


# Create the configuration file

In [51]:
# Write the workspace configuration file (ws_config.json), using ./config
# as the target path.
ws.write_config(
    path="./config",
    file_name="ws_config.json",
)
print('Configuration saved')

Configuration saved


# Create a remote compute target

In [22]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Step 1: name the cluster and set the minimal and maximal number of nodes.
# Environment variables are always strings, so the node counts are cast to
# int to get the same type whether or not an override is set.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpucluster")
min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 3))

# Step 2: choose the virtual machine size of the cluster nodes
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

if compute_name in ws.compute_targets:
    # Re-running the cell must not try to create the same cluster twice
    compute_target = ws.compute_targets[compute_name]
    print('Found existing compute target: {}'.format(compute_name))
else:
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size, min_nodes=min_nodes, max_nodes=max_nodes)

    # Create the cluster and block until provisioning has finished, so the
    # cells that submit runs do not race against the deployment
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    compute_target.wait_for_completion(show_output=True)

    print('Compute target created')

Compute target created


# Training scripts

In [30]:
import os

# Directory that receives the training scripts; it is created on first use
# and left untouched when it already exists.
folder_training_script = './training_scripts'
os.makedirs(name=folder_training_script, exist_ok=True)

print('Done')

Done


## Support Vector Machine

In [43]:
%%writefile $folder_training_script/train_SVC.py

import time, datetime
from contextlib import contextmanager
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.svm import SVC


@contextmanager
def measure_time(label):
    """
    Context manager that prints how long the wrapped block took to run.

    Parameters
    ----------
    label : str
        Text inserted between the square brackets of the printed message.

    The message 'Duration of [<label>]: <timedelta>' is printed when the
    block exits, including when it exits through an exception (the
    exception is then propagated unchanged).
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start
        print('Duration of [{}]: {}'.format(label, datetime.timedelta(seconds=elapsed)))


def load_data(path, to_split=True):
    """
    Load the csv file and return the model inputs and labels.

    Parameters
    ----------
    path : str
        Location of the csv file. Its first row is used as the header and
        its first column as the index; the columns 'cluster' (input
        feature) and 'crowd_class' (label) are read from it.
    to_split : bool, default True
        Whether to split the data into a training and a testing set.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test) when ``to_split`` is True, using
        a fixed 80/20 split (random_state=42); otherwise (X, y).
        The inputs are 2-D arrays with a single column.
    """
    # Read the csv file
    df = pd.read_csv(path, header=0, index_col=0)

    # Get the output values. No squeeze(): the column is already 1-D and
    # squeezing would collapse a single-row file into a 0-d array.
    y = df['crowd_class'].values

    # Get the input values as a single-column 2-D array
    feature = 'cluster'
    X = df[feature].values.reshape(-1, 1)

    # Only compute the split when the caller asked for it
    if not to_split:
        return X, y

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
    
    
def train(path, to_split=True):
    """
    Train a support vector classifier, save it and print its accuracy.

    Parameters
    ----------
    path : str
        Location of the csv file, forwarded to ``load_data``.
    to_split : bool, default True
        When True the model is fitted on the training split and the accuracy
        is also reported on the testing split; when False the model is
        fitted on everything ``load_data`` returns.

    The fitted model is written to '../models/svc.pkl' and the accuracies
    are printed to stdout.
    """
    import os

    filename = "../models/svc.pkl"

    # Load the training (and testing) set(s)
    if to_split:
        X_train, X_test, y_train, y_test = load_data(path, to_split=to_split)
    else:
        X_train, y_train = load_data(path, to_split=to_split)

    # Only the fit is timed, so the reported duration matches its label
    with measure_time('Training...'):
        model = SVC(kernel='rbf', max_iter=100000)
        model.fit(X_train, y_train)

    # joblib.dump does not create missing folders, so make sure the target
    # directory exists before saving the model.
    # NOTE(review): `joblib` is imported from sklearn.externals at the top of
    # this script; that module was removed in scikit-learn 0.23 - confirm the
    # scikit-learn version used for the run.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    joblib.dump(model, filename)

    y_pred = model.predict(X_train)
    print("=================================================================")
    print("SVM Training set accuracy: {}".format(accuracy_score(y_train, y_pred)))
    print("=================================================================")

    if to_split:
        y_pred = model.predict(X_test)
        print("SVM Test set accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("=================================================================")


# Script entry point: fit and evaluate the SVC on the cleaned HSL data,
# keeping the default train/test split.
path = './data/cleaned_HSL_data.csv'
train(path, to_split=True)

Overwriting ./training_scripts/train_SVC.py


## Logistic Regression

In [44]:
%%writefile $folder_training_script/train_logreg.py

import time, datetime
from contextlib import contextmanager
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression


@contextmanager
def measure_time(label):
    """
    Context manager that prints how long the wrapped block took to run.

    Parameters
    ----------
    label : str
        Text inserted between the square brackets of the printed message.

    The message 'Duration of [<label>]: <timedelta>' is printed when the
    block exits, including when it exits through an exception (the
    exception is then propagated unchanged).
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start
        print('Duration of [{}]: {}'.format(label, datetime.timedelta(seconds=elapsed)))


def load_data(path, to_split=True):
    """
    Load the csv file and return the model inputs and labels.

    Parameters
    ----------
    path : str
        Location of the csv file. Its first row is used as the header and
        its first column as the index; the columns 'cluster' (input
        feature) and 'crowd_class' (label) are read from it.
    to_split : bool, default True
        Whether to split the data into a training and a testing set.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test) when ``to_split`` is True, using
        a fixed 80/20 split (random_state=42); otherwise (X, y).
        The inputs are 2-D arrays with a single column.
    """
    # Read the csv file
    df = pd.read_csv(path, header=0, index_col=0)

    # Get the output values. No squeeze(): the column is already 1-D and
    # squeezing would collapse a single-row file into a 0-d array.
    y = df['crowd_class'].values

    # Get the input values as a single-column 2-D array
    feature = 'cluster'
    X = df[feature].values.reshape(-1, 1)

    # Only compute the split when the caller asked for it
    if not to_split:
        return X, y

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
    
    
def train(path, to_split=True):
    """
    Train a logistic regression classifier, save it and print its accuracy.

    Parameters
    ----------
    path : str
        Location of the csv file, forwarded to ``load_data``.
    to_split : bool, default True
        When True the model is fitted on the training split and the accuracy
        is also reported on the testing split; when False the model is
        fitted on everything ``load_data`` returns.

    The fitted model is written to '../models/lin_reg.pkl' and the
    accuracies are printed to stdout.
    """
    import os

    # NOTE(review): the file is named 'lin_reg' although the model is a
    # logistic regression; kept as-is because consumers may rely on the name.
    filename = "../models/lin_reg.pkl"

    # Load the training (and testing) set(s)
    if to_split:
        X_train, X_test, y_train, y_test = load_data(path, to_split=to_split)
    else:
        X_train, y_train = load_data(path, to_split=to_split)

    # Only the fit is timed, so the reported duration matches its label
    with measure_time('Training...'):
        model = LogisticRegression(random_state=42)
        model.fit(X_train, y_train)

    # joblib.dump does not create missing folders, so make sure the target
    # directory exists before saving the model.
    # NOTE(review): `joblib` is imported from sklearn.externals at the top of
    # this script; that module was removed in scikit-learn 0.23 - confirm the
    # scikit-learn version used for the run.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    joblib.dump(model, filename)

    y_pred = model.predict(X_train)
    print("=================================================================")
    print("Logistic Regression Training set accuracy: {}".format(accuracy_score(y_train, y_pred)))
    print("=================================================================")

    if to_split:
        y_pred = model.predict(X_test)
        print("Logistic Regression Test set accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("=================================================================")


# Script entry point: fit and evaluate the logistic regression on the
# cleaned HSL data, keeping the default train/test split.
path = './data/cleaned_HSL_data.csv'
train(path, to_split=True)

Overwriting ./training_scripts/train_logreg.py


## K-Nearest Neighbors

In [45]:
%%writefile $folder_training_script/train_knn.py

import time, datetime
from contextlib import contextmanager
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.neighbors import KNeighborsClassifier


@contextmanager
def measure_time(label):
    """
    Context manager that prints how long the wrapped block took to run.

    Parameters
    ----------
    label : str
        Text inserted between the square brackets of the printed message.

    The message 'Duration of [<label>]: <timedelta>' is printed when the
    block exits, including when it exits through an exception (the
    exception is then propagated unchanged).
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start
        print('Duration of [{}]: {}'.format(label, datetime.timedelta(seconds=elapsed)))


def load_data(path, to_split=True):
    """
    Load the csv file and return the model inputs and labels.

    Parameters
    ----------
    path : str
        Location of the csv file. Its first row is used as the header and
        its first column as the index; the columns 'cluster' (input
        feature) and 'crowd_class' (label) are read from it.
    to_split : bool, default True
        Whether to split the data into a training and a testing set.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test) when ``to_split`` is True, using
        a fixed 80/20 split (random_state=42); otherwise (X, y).
        The inputs are 2-D arrays with a single column.
    """
    # Read the csv file
    df = pd.read_csv(path, header=0, index_col=0)

    # Get the output values. No squeeze(): the column is already 1-D and
    # squeezing would collapse a single-row file into a 0-d array.
    y = df['crowd_class'].values

    # Get the input values as a single-column 2-D array
    feature = 'cluster'
    X = df[feature].values.reshape(-1, 1)

    # Only compute the split when the caller asked for it
    if not to_split:
        return X, y

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
    
    
def train(path, to_split=True):
    """
    Train a k-nearest neighbors classifier, save it and print its accuracy.

    Parameters
    ----------
    path : str
        Location of the csv file, forwarded to ``load_data``.
    to_split : bool, default True
        When True the model is fitted on the training split and the accuracy
        is also reported on the testing split; when False the model is
        fitted on everything ``load_data`` returns.

    The fitted model is written to '../models/knn.pkl' and the accuracies
    are printed to stdout.
    """
    import os

    filename = "../models/knn.pkl"

    # Load the training (and testing) set(s)
    if to_split:
        X_train, X_test, y_train, y_test = load_data(path, to_split=to_split)
    else:
        X_train, y_train = load_data(path, to_split=to_split)

    # Only the fit is timed, so the reported duration matches its label
    with measure_time('Training...'):
        model = KNeighborsClassifier(n_neighbors=100)
        model.fit(X_train, y_train)

    # joblib.dump does not create missing folders, so make sure the target
    # directory exists before saving the model.
    # NOTE(review): `joblib` is imported from sklearn.externals at the top of
    # this script; that module was removed in scikit-learn 0.23 - confirm the
    # scikit-learn version used for the run.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    joblib.dump(model, filename)

    y_pred = model.predict(X_train)
    print("=================================================================")
    print("Knn Training set accuracy: {}".format(accuracy_score(y_train, y_pred)))
    print("=================================================================")

    if to_split:
        y_pred = model.predict(X_test)
        print("Knn Test set accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("=================================================================")


# Script entry point: fit and evaluate the k-nearest neighbors classifier on
# the cleaned HSL data, keeping the default train/test split.
path = './data/cleaned_HSL_data.csv'
train(path, to_split=True)

Writing ./training_scripts/train_knn.py


## Multi-Layer Perceptron

In [46]:
%%writefile $folder_training_script/train_mlp.py

import time, datetime
from contextlib import contextmanager
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.externals import joblib
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV


@contextmanager
def measure_time(label):
    """
    Context manager that prints how long the wrapped block took to run.

    Parameters
    ----------
    label : str
        Text inserted between the square brackets of the printed message.

    The message 'Duration of [<label>]: <timedelta>' is printed when the
    block exits, including when it exits through an exception (the
    exception is then propagated unchanged).
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - start
        print('Duration of [{}]: {}'.format(label, datetime.timedelta(seconds=elapsed)))


def load_data(path, to_split=True):
    """
    Load the csv file and return the model inputs and labels.

    Parameters
    ----------
    path : str
        Location of the csv file. Its first row is used as the header and
        its first column as the index; the columns 'cluster' (input
        feature) and 'crowd_class' (label) are read from it.
    to_split : bool, default True
        Whether to split the data into a training and a testing set.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test) when ``to_split`` is True, using
        a fixed 80/20 split (random_state=42); otherwise (X, y).
        The inputs are 2-D arrays with a single column.
    """
    # Read the csv file
    df = pd.read_csv(path, header=0, index_col=0)

    # Get the output values. No squeeze(): the column is already 1-D and
    # squeezing would collapse a single-row file into a 0-d array.
    y = df['crowd_class'].values

    # Get the input values as a single-column 2-D array
    feature = 'cluster'
    X = df[feature].values.reshape(-1, 1)

    # Only compute the split when the caller asked for it
    if not to_split:
        return X, y

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test
    
    
def tune_hyperparameter(path):
    """
    Search for the best MLP hyperparameters with a randomized search.

    Parameters
    ----------
    path : str
        Location of the csv file, forwarded to ``load_data``.

    Returns
    -------
    dict
        The best parameter combination found (``best_params_`` of the
        search), with the keys 'hidden_layer_sizes', 'activation',
        'learning_rate_init', 'learning_rate' and 'momentum'.
    """
    # Tune on the training split only. load_data splits with a fixed
    # random_state, so the testing samples left out here are the same ones
    # used later to report the test accuracy; searching on the full data
    # set would leak them into the model selection.
    X_train, _, y_train, _ = load_data(path, to_split=True)

    # Create the random grid
    random_grid = {'hidden_layer_sizes': [(20,), (50,), (100,), (150,)],
                   'activation': ['tanh', 'relu', 'logistic', 'identity'],
                   'learning_rate_init': [0.005, 0.01, 0.05, 0.1, 0.2, 0.3],
                   'learning_rate': ['constant', 'adaptive'],
                   'momentum': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]}

    # Use the random grid to search for best hyperparameters
    # First create the base model to tune; it is seeded so that two searches
    # on the same data give the same result
    mlp = MLPClassifier(solver='sgd', early_stopping=True, random_state=42)
    # Random search of parameters, using 5 fold cross validation,
    # search across 100 different combinations, and use all available cores
    mlp_random = RandomizedSearchCV(estimator=mlp,
                                    param_distributions=random_grid,
                                    n_iter=100,
                                    cv=5,
                                    verbose=2,
                                    random_state=42,
                                    n_jobs=-1)
    # Fit the random search model
    mlp_random.fit(X_train, y_train)

    # Return optimal parameters
    return mlp_random.best_params_
    
    
def create_estimator(path, params, to_split=True):
    """
    Train an MLP classifier with the given hyperparameters, save it and
    print its accuracy.

    Parameters
    ----------
    path : str
        Location of the csv file, forwarded to ``load_data``.
    params : dict
        Hyperparameters of the network. The keys 'hidden_layer_sizes',
        'learning_rate_init', 'learning_rate', 'activation' and 'momentum'
        are required (a missing key raises KeyError).
    to_split : bool, default True
        When True the model is fitted on the training split and the accuracy
        is also reported on the testing split; when False the model is
        fitted on everything ``load_data`` returns.

    The fitted model is written to 'models/mlp.pkl' and the accuracies are
    printed to stdout.
    """
    import os

    # NOTE(review): the other training scripts save under '../models/';
    # confirm which location is the intended one.
    filename = "models/mlp.pkl"

    # Load the training (and testing) set(s)
    if to_split:
        X_train, X_test, y_train, y_test = load_data(path, to_split=to_split)
    else:
        X_train, y_train = load_data(path, to_split=to_split)

    # Extract parameters
    hidden_layer_sizes = params['hidden_layer_sizes']
    learning_rate_init = params['learning_rate_init']
    learning_rate = params['learning_rate']
    activation = params['activation']
    momentum = params['momentum']

    # Only the fit is timed, so the reported duration matches its label.
    # The model is seeded so that repeated runs give the same network.
    with measure_time('Training...'):
        model = MLPClassifier(solver='sgd',
                              hidden_layer_sizes=hidden_layer_sizes,
                              early_stopping=True,
                              learning_rate_init=learning_rate_init,
                              learning_rate=learning_rate,
                              activation=activation,
                              momentum=momentum,
                              random_state=42)
        model.fit(X_train, y_train)

    # joblib.dump does not create missing folders, so make sure the target
    # directory exists before saving the model.
    # NOTE(review): `joblib` is imported from sklearn.externals at the top of
    # this script; that module was removed in scikit-learn 0.23 - confirm the
    # scikit-learn version used for the run.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    joblib.dump(model, filename)

    y_pred = model.predict(X_train)
    print("=================================================================")
    print("MLP Training set accuracy: {}".format(accuracy_score(y_train, y_pred)))
    print("=================================================================")

    if to_split:
        y_pred = model.predict(X_test)
        print("MLP Test set accuracy: {}".format(accuracy_score(y_test, y_pred)))
        print("=================================================================")



# Script entry point, step 1: run the randomized search and report the
# winning hyperparameters.
path = './data/cleaned_HSL_data.csv'
params = tune_hyperparameter(path=path)
print("Best parameters", params)

# Step 2: fit and evaluate the final MLP with those hyperparameters,
# keeping the default train/test split.
create_estimator(path, params, to_split=True)

Writing ./training_scripts/train_mlp.py


# Run code on Azure

In [53]:
from azureml.train.sklearn import SKLearn
from azureml.core import Experiment


# Configure a scikit-learn estimator that runs the SVC training script from
# the script folder on the remote compute target. The training scripts also
# import pandas, so it is declared explicitly next to scikit-learn instead
# of relying on whatever the base environment happens to ship.
est = SKLearn(source_directory=folder_training_script,
              compute_target=compute_target,
              entry_script='train_SVC.py',
              conda_packages=['scikit-learn', 'pandas'])

# Create an experiment
experiment = Experiment(workspace=ws, name="gogreen-experiment")
print('Experiment created')


# Run the experiment; the run object is the cell's last expression so its
# status is displayed in the notebook
run = experiment.submit(config=est)
run



Experiment created


Experiment,Id,Type,Status,Details Page,Docs Page
gogreen-experiment,gogreen-experiment_1573945753_53351dc1,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation
