## This sample notebook creates an experiment to train a classification model; the model is then registered and used for inferencing

#### SMS Spam Collection Dataset
Source: https://www.kaggle.com/uciml/sms-spam-collection-dataset


In [1]:
import azureml.core
from azureml.core import Workspace, Dataset

# Connect to the workspace described by the saved config file
ws = Workspace.from_config()
print(f'Ready to use Azure ML {azureml.core.VERSION} to work with {ws.name}')

Ready to use Azure ML 1.38.0 to work with mlopsdev


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [3]:
# Load the SMS spam CSV used for model exploration
data = pd.read_csv('./datasets/spamformodel.csv')
# Second, independent DataFrame read from the same file
# (presumably kept aside for inferencing later in the notebook — confirm)
inferecing_data = pd.read_csv('./datasets/spamformodel.csv')
# Preview the first five rows
data.head(5)

Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
data.describe()

Unnamed: 0,labels,text
count,3000,3000
unique,2,2851
top,ham,"Sorry, I'll call later"
freq,2591,19


In [5]:
# `os` is imported here because this cell runs before the later cell that
# imports it; without the import a fresh-kernel "Run All" stops with a NameError.
import os

# Create a local folder for batch data (no error if it already exists)
batch_folder = './batch-data'
os.makedirs(batch_folder, exist_ok=True)
print("Folder created!")

Folder created!


In [6]:
# Default datastore attached to the workspace
default_ds = ws.get_default_datastore()

# Upload the local CSV to the datastore (this may take a short while).
# NOTE(review): the recorded output reports that "datastore.upload_files" is
# deprecated and suggests "FileDatasetFactory.upload_directory" instead.
default_ds.upload_files(files=['./datasets/spamformodel.csv'], # Upload the spam CSV file from ./datasets
                        target_path= 'spam-data', # Put it in a folder path in the datastore
                        overwrite=True, # Replace existing files of the same name
                        show_progress=True)
    
# Create a tabular dataset from the uploaded file's path on the datastore
tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'spam-data/spamformodel.csv'))

# Register the dataset as 'email_dataset', creating a new version each time.
# If registration raises, the error is only printed and tab_data_set keeps
# referring to the unregistered dataset created above.
try:
    tab_data_set = tab_data_set.register(workspace=ws, 
                                        name='email_dataset',
                                        description='email spam or ham data',
                                        tags = {'format':'CSV'},
                                        create_new_version=True)
except Exception as ex:
    print(ex)

# Display the first 20 rows as a Pandas dataframe
tab_data_set.take(20).to_pandas_dataframe()

"datastore.upload_files" is deprecated after version 1.0.69. Please use "FileDatasetFactory.upload_directory" instead. See Dataset API change notice at https://aka.ms/dataset-deprecation.


Uploading an estimated of 1 files
Uploading ./datasets/spamformodel.csv
Uploaded ./datasets/spamformodel.csv, 1 files out of an estimated total of 1
Uploaded 1 files


Unnamed: 0,labels,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [7]:
# Features: the raw message text; target: the spam/ham label column
X = data['text']
Y = data['labels']

In [8]:
# Fit a CountVectorizer on the messages and turn them into a
# (messages x vocabulary) matrix of token counts
count_vectorizer = CountVectorizer()
transformed_vector = count_vectorizer.fit_transform(X)

In [9]:
transformed_vector.shape

(3000, 6245)

In [10]:
# Word frequencies for the first message, printed as (row, vocabulary index) and count
print(transformed_vector[0])

  (0, 2550)	1
  (0, 5784)	1
  (0, 3124)	1
  (0, 4249)	1
  (0, 1641)	1
  (0, 894)	1
  (0, 3977)	1
  (0, 2928)	1
  (0, 1224)	1
  (0, 2604)	1
  (0, 6120)	1
  (0, 3214)	1
  (0, 1222)	1
  (0, 1431)	1
  (0, 5501)	1
  (0, 2581)	1
  (0, 727)	1
  (0, 5955)	1


In [11]:
# Re-weight the raw token counts into TF-IDF scores
tfid_transformer = TfidfTransformer() 
tfidf_vector = tfid_transformer.fit_transform(transformed_vector)

In [12]:
#tfidf score per document
print(tfidf_vector[0])

  (0, 6120)	0.2194752986462319
  (0, 5955)	0.19473901783061617
  (0, 5784)	0.22793693965845888
  (0, 5501)	0.15784322212997065
  (0, 4249)	0.2506070193198575
  (0, 3977)	0.16363891047125267
  (0, 3214)	0.28050545509161845
  (0, 3124)	0.31525135382042524
  (0, 2928)	0.11038091843908045
  (0, 2604)	0.18889359811399475
  (0, 2581)	0.15874972350310806
  (0, 2550)	0.1538778038097993
  (0, 1641)	0.26268283838726564
  (0, 1431)	0.2735917909803556
  (0, 1224)	0.2735917909803556
  (0, 1222)	0.2998760486969354
  (0, 894)	0.2542211973750386
  (0, 727)	0.31525135382042524


In [13]:
# Hold out 20% of the rows for evaluation. The split is seeded so that
# re-running the notebook produces the same partition and the same metrics.
x_train, x_test, y_train, y_test = train_test_split(tfidf_vector, Y, test_size = 0.2, random_state = 42)

In [14]:
def summarize_classification(y_test, y_pred):
    """Print accuracy, precision and recall for a set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.

    Precision and recall are computed with ``average='weighted'``.
    """
    acc = accuracy_score(y_test, y_pred, normalize=True)  # fraction of correct predictions
    num_acc = accuracy_score(y_test, y_pred, normalize=False)  # number of correct predictions
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    print('accuracy count:', num_acc)
    print('accuracy score:', acc)
    print('precision:', prec)
    print('recall:', recall)

In [15]:
# Fit Gaussian Naive Bayes; the sparse TF-IDF training matrix is converted to a dense array first
clf = GaussianNB().fit(x_train.toarray(), y_train)

In [16]:
# Predict labels for the held-out rows (converted to a dense array as well)
y_pred = clf.predict(x_test.toarray())

In [17]:
summarize_classification(y_test, y_pred)

aacuracy count: 547
accuracy score: 0.9116666666666666
precision: 0.9352229805427899
recall: 0.9116666666666666


## Create Training Script

In [18]:
import os
# Folder under the current working directory; later cells write the training
# script and the environment definition into it
script_folder = os.path.join(os.getcwd(), "train")
print(script_folder)
# Create the folder if it does not exist yet
os.makedirs(script_folder, exist_ok=True)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/devbox/code/Users/babal/EmailClass/email_parallelRun_classificationbatch_pipeline/train


In [19]:
%%writefile $script_folder/classifier_training.py

import argparse
from azureml.core import Run

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import joblib
import os

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt


def summarize_classification(y_test, y_pred, run):
    """Compute classification metrics, log them to the run and print them.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, aligned with ``y_test``.
        run: Run object; every metric is recorded on it with ``run.log``.

    Precision and recall are computed with ``average='weighted'``.
    """
    acc = accuracy_score(y_test, y_pred, normalize=True)  # fraction of correct predictions
    num_acc = accuracy_score(y_test, y_pred, normalize=False)  # number of correct predictions
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Metric names are kept as-is so existing run histories stay comparable
    run.log('acc count', num_acc)
    run.log('Accuracy', acc)
    run.log('prec', prec)
    run.log('recall', recall)

    print('accuracy count:', num_acc)
    print('accuracy score:', acc)
    print('precision:', prec)
    print('recall:', recall)
    


def getRuntimeArgs():
    """Parse the command-line options passed to the training script.

    Returns:
        argparse.Namespace whose ``training_dataset_id`` attribute holds the
        value given via ``--input-data`` (``None`` when the option is absent).
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--input-data",
        dest='training_dataset_id',
        type=str,
        help='training dataset',
    )
    return arg_parser.parse_args()

def model_train(ds_df, run, test_size=0.2, random_state=42):
    """Train and evaluate the spam/ham text classifier.

    Args:
        ds_df: DataFrame with a 'text' column (features) and a 'labels'
            column (target).
        run: Run object that receives the evaluation metrics.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the train/test split; pass ``None`` to get a
            different random split on every call.

    Returns:
        The fitted Pipeline (CountVectorizer followed by LogisticRegression).
    """
    X = ds_df['text']
    Y = ds_df['labels']

    # sklearn pipeline: the output of the count vectorizer is fed to the
    # classifier, so the raw text can be passed to fit/predict directly
    clf = Pipeline([
                            ('count_vectorizer', CountVectorizer()),
                            ('classifier', LogisticRegression(solver='lbfgs', max_iter=10000))
                        ])

    # Seeded split so that the metrics logged to the run are reproducible
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    print('type of x_test')
    print(type(x_test))
    model = clf.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    print('*************************')
    print('model predictions:')
    print(y_pred)
    summarize_classification(y_test, y_pred, run)

    return model



def main():
    """Train the email classifier inside a run, then upload and register it.

    Steps: parse the command line, load the 'training_data' named input as a
    DataFrame, train the model, dump it to a pickle file, upload that file to
    the run's ``outputs/`` folder, register it as 'email_classifier', and
    finally mark the run as complete.
    """
    # The parsed arguments are not used below; the dataset is read from the
    # run's named input instead.
    args = getRuntimeArgs()

    # Get the experiment run context
    run = Run.get_context()

    dataset_dir = './dataset/'
    os.makedirs(dataset_dir, exist_ok=True)
    ws = run.experiment.workspace
    print(ws)

    print("Loading Data...")
    data = run.input_datasets['training_data'].to_pandas_dataframe()

    print(data.columns)
    lr = model_train(data, run)

    # Save the trained model locally, then upload it into the run's outputs.
    # The uploaded path is derived once so upload and registration cannot drift apart.
    model_file = 'email_classifier.pkl'
    model_path = 'outputs/' + model_file
    joblib.dump(value=lr, filename=model_file)
    run.upload_file(name=model_path, path_or_stream='./' + model_file)

    # Register the model from the uploaded artifact before the run is marked complete
    run.register_model(model_path=model_path, model_name='email_classifier',
                       tags={'Training context':'spam or ham'})

    # Complete the run as the very last step
    run.complete()

    #print('Model trained and registered.')
 

if __name__ == "__main__":
    main()

Writing /mnt/batch/tasks/shared/LS_root/mounts/clusters/devbox/code/Users/babal/EmailClass/email_parallelRun_classificationbatch_pipeline/train/classifier_training.py


In [21]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import ComputeTargetException
# The compute name is built from this prefix plus "-cluster"
user = 'cpu'
compute_name = user + "-cluster"
print(compute_name)

# checks to see if compute target already exists in workspace, else create it
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
except ComputeTargetException:
    # Lookup failed: define a STANDARD_D13 cluster that scales between 0 and 1 nodes
    config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D13",
                                                   min_nodes=0, 
                                                   max_nodes=1)

    compute_target = ComputeTarget.create(workspace=ws, name=compute_name, provisioning_configuration=config)
    # Wait for provisioning to finish (timeout: 40 minutes), showing progress output
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=40)

cpu-cluster


## Define Environment

In [22]:
%%writefile $script_folder/experiment_env.yml
# Conda environment definition loaded by the notebook for the training experiment
name: experiment_env
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
  # scikit-learn, matplotlib and pandas are imported by classifier_training.py.
  # NOTE(review): the script also imports joblib, which is not listed here —
  # presumably it is installed as a scikit-learn dependency; confirm.
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
  # Packages installed through pip rather than conda
- pip:
  - azureml-defaults
  - pyarrow

Writing /mnt/batch/tasks/shared/LS_root/mounts/clusters/devbox/code/Users/babal/EmailClass/email_parallelRun_classificationbatch_pipeline/train/experiment_env.yml


In [23]:
from azureml.core import Environment

# Create a Python environment for the experiment (from a .yml file);
# "email classification" is the environment name, the .yml lives in script_folder
experiment_env = Environment.from_conda_specification("email classification", script_folder + "/experiment_env.yml")

# Let Azure ML manage dependencies
experiment_env.python.user_managed_dependencies = False 

# Print the environment details (name plus the conda specification as text)
print(experiment_env.name, 'defined.')
print(experiment_env.python.conda_dependencies.serialize_to_string())

email classification defined.
name: experiment_env
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow



In [24]:
import azureml.core.runconfig
from azureml.core import Environment, Experiment
from azureml.core import ScriptRunConfig
from azureml.widgets import RunDetails

# Get the training dataset (registered earlier under the name 'email_dataset')
email_training_ds = ws.datasets.get('email_dataset')

# Create a script config. The dataset is passed as the named input
# 'training_data', which is the key the training script looks up in
# run.input_datasets.
# NOTE(review): no compute_target is passed here, and the recorded run output
# below reports 'target': 'local' — so the cpu-cluster created earlier is not
# used for this run. Confirm that this is intended.
script_config = ScriptRunConfig(source_directory=script_folder,
                                script='classifier_training.py',
                                arguments = [
                                             '--input-data', email_training_ds.as_named_input('training_data')], # Reference to dataset
                                environment=experiment_env) 

# submit the experiment
experiment_name = '00_email_classification_model'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
# Show the run-monitoring widget, then block until the run has finished
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 's…

Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{'runId': '00_email_classification_model_1644414207_7d64e672',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2022-02-09T13:43:28.637661Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': '922b361d-8a3f-45c1-83a7-c272fd00c792',
  'azureml.git.repository_uri': 'https://github.com/balakreshnan/email_parallelRun_classificationbatch_pipeline.git',
  'mlflow.source.git.repoURL': 'https://github.com/balakreshnan/email_parallelRun_classificationbatch_pipeline.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': '186a6f3508e212248a2f60eb388da1c36a0ce6f1',
  'mlflow.source.git.commit': '186a6f3508e212248a2f60eb388da1c36a0ce6f1',
  'azureml.git.dirty': 'True'},
 'inputDatasets': [{'dataset': {'id': '0c1e0736-e640-4425-9536-93a7a89d1a6d'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'training_data', 'mechanism': 'Direct'}}],
 'outputDatasets': [],
 'runDefinition': {'scrip

Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms