## This sample notebook shows how to create an experiment that trains a classification model; the trained model is then registered and used for inferencing

#### SMS Spam Collection Dataset
Source: https://www.kaggle.com/uciml/sms-spam-collection-dataset


In [None]:
import azureml.core
from azureml.core import Workspace, Dataset

# Load the workspace from the saved config file
# (from_config() is called without arguments, so the SDK's default config lookup is used)
ws = Workspace.from_config()
# Echo the SDK version and workspace name as a quick connectivity check
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [None]:
# Load the local CSV used for exploration and for the baseline model below
data = pd.read_csv('./datasets/spamformodel.csv')
# NOTE(review): the same file is read a second time under the (misspelled) name
# `inferecing_data`; it is not referenced in the cells visible here — confirm it is needed.
inferecing_data = pd.read_csv('./datasets/spamformodel.csv')
# Preview the first five rows
data.head(5)

In [None]:
# Summary statistics of the loaded frame
data.describe()

In [None]:
# Create a folder
# `os` is imported in this cell because it is first needed here; without it the
# notebook fails with NameError on a fresh kernel (Restart & Run All).
import os

batch_folder = './batch-data'
# exist_ok=True keeps the cell safe to re-run
os.makedirs(batch_folder, exist_ok=True)
print("Folder created!")

In [None]:
# The workspace's default datastore is the source of the batch-inference files
default_ds = ws.get_default_datastore()

# Register a dataset for the input data
# NOTE(review): no visible cell uploads files to 'spam-data-inferencing/' on the datastore —
# presumably they were uploaded beforehand; confirm.
batch_data_set = Dataset.File.from_files(path=(default_ds, 'spam-data-inferencing/'), validate=False)
try:
    # create_new_version=True: re-running the cell registers a new version under the same name
    batch_data_set = batch_data_set.register(workspace=ws, 
                                             name='spam-batch-data-inference',
                                             description='inference batch data',
                                             create_new_version=True)
except Exception as ex:
    # Best-effort: a registration failure is only printed; the cell still prints "Done!"
    print(ex)

print("Done!")

In [None]:
# The workspace's default datastore is the source of the training CSV
default_ds = ws.get_default_datastore()

#Create a tabular dataset from the path on the datastore (this may take a short while)
# NOTE(review): no visible cell uploads 'spam-data/spamformodel.csv' to the datastore —
# presumably it was uploaded beforehand; confirm.
tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'spam-data/spamformodel.csv'))

try:
    # Registered as 'email_dataset'; the experiment-submission cell looks it up by this name
    tab_data_set = tab_data_set.register(workspace=ws, 
                                        name='email_dataset',
                                        description='email spam or ham data',
                                        tags = {'format':'CSV'},
                                        create_new_version=True)
except Exception as ex:
    # Best-effort: on failure the error is printed and tab_data_set stays the unregistered dataset
    print(ex)

# Display the first 20 rows as a Pandas dataframe
tab_data_set.take(20).to_pandas_dataframe()

In [None]:
# Feature column ('text') and target column ('labels') of the local frame
X = data['text']
Y = data['labels']

In [None]:
# Fit the vectorizer on every message in X and encode each one as token counts
count_vectorizer = CountVectorizer()
transformed_vector = count_vectorizer.fit_transform(X)

In [None]:
# Dimensions of the count matrix produced above
transformed_vector.shape

In [None]:
# Word frequency: the count-matrix row for the first message
print(transformed_vector[0])

In [None]:
# Re-weight the raw token counts with TF-IDF
tfid_transformer = TfidfTransformer() 
tfidf_vector = tfid_transformer.fit_transform(transformed_vector)

In [None]:
# TF-IDF scores for the first message
print(tfidf_vector[0])

In [None]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split (and the metrics below) are the same on every run.
x_train, x_test, y_train, y_test = train_test_split(tfidf_vector, Y, test_size=0.2, random_state=42)

In [None]:
def summarize_classification(y_test, y_pred):
    """Print accuracy, weighted precision and weighted recall for a set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels to compare against ``y_test``.
    """
    acc = accuracy_score(y_test, y_pred, normalize=True)       # fraction of correct predictions
    num_acc = accuracy_score(y_test, y_pred, normalize=False)  # number of correct predictions
    # average='weighted': per-class scores averaged by class support
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    print('accuracy count:', num_acc)
    print('accuracy score:', acc)
    print('precision:', prec)
    print('recall:', recall)

In [None]:
# Fit a Gaussian Naive Bayes baseline; .toarray() converts the training matrix to a dense array first
clf = GaussianNB().fit(x_train.toarray(), y_train)

In [None]:
# Predict labels for the held-out rows (converted to dense the same way as the training rows)
y_pred = clf.predict(x_test.toarray())

In [None]:
# Print the baseline's metrics on the held-out split
summarize_classification(y_test, y_pred)

## Create Training Script

In [None]:
import os
# Folder under the current working directory that the %%writefile cells below write into
script_folder = os.path.join(os.getcwd(), "train")
print(script_folder)
# exist_ok=True keeps the cell safe to re-run
os.makedirs(script_folder, exist_ok=True)

In [None]:
%%writefile $script_folder/classifier_training.py

import argparse
from azureml.core import Run

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import joblib
import os

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt


def summarize_classification(y_test, y_pred, run):
    """Compute accuracy, weighted precision and weighted recall; log and print them.

    Args:
        y_test: True labels.
        y_pred: Predicted labels to compare against ``y_test``.
        run: Run object whose ``log(name, value)`` records each metric.
    """
    acc = accuracy_score(y_test, y_pred, normalize=True)       # fraction of correct predictions
    num_acc = accuracy_score(y_test, y_pred, normalize=False)  # number of correct predictions
    # average='weighted': per-class scores averaged by class support
    prec = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    # Metric names are kept exactly as before so existing run queries still match
    run.log('acc count', num_acc)
    run.log('Accuracy', acc)
    run.log('prec', prec)
    run.log('recall', recall)

    print('accuracy count:', num_acc)
    print('accuracy score:', acc)
    print('precision:', prec)
    print('recall:', recall)

    

# def summarize_classification2(model, x_test, y_test, run):
#     y_hat = model.predict(x_test)
#     acc = np.average(y_hat == y_test)
#     print('Accuracy:', acc)
#     run.log('Accuracy', np.float(acc))

    #change labels to 1 and 0's for this to work
    # calculate AUC
#     y_scores = model.predict_proba(x_test)
#     auc = roc_auc_score(y_test,y_scores[:,1])
#     print('AUC: ' + str(auc))
#     run.log('AUC', np.float(auc))

#     # plot ROC curve
#     fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
#     fig = plt.figure(figsize=(6, 4))
#     # Plot the diagonal 50% line
#     plt.plot([0, 1], [0, 1], 'k--')
#     # Plot the FPR and TPR achieved by our model
#     plt.plot(fpr, tpr)
#     plt.xlabel('False Positive Rate')
#     plt.ylabel('True Positive Rate')
#     plt.title('ROC Curve')
#     run.log_image(name = "ROC", plot = fig)
#     plt.show()


def getRuntimeArgs():
    """Parse the script's command line and return the resulting namespace.

    The only option is ``--input-data``, whose value is exposed on the
    returned namespace as ``training_dataset_id``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--input-data",
        dest='training_dataset_id',
        type=str,
        help='training dataset',
    )
    return cli.parse_args()

def model_train(ds_df, run, test_size=0.2, random_state=None):
    """Fit a CountVectorizer + LogisticRegression pipeline and report hold-out metrics.

    Args:
        ds_df: DataFrame with a 'text' column (features) and a 'labels' column (target).
        run: Run object passed through to ``summarize_classification`` for metric logging.
        test_size: Fraction of rows held out for evaluation. Defaults to 0.2, the
            value that was previously hard-coded.
        random_state: Seed forwarded to ``train_test_split``. ``None`` (default) keeps
            the previous unseeded behaviour; pass an int for a repeatable split.

    Returns:
        The fitted sklearn ``Pipeline``.
    """
    X = ds_df['text']
    Y = ds_df['labels']

    # sklearn pipeline: the output of the count vectorizer is fed to the classifier
    clf = Pipeline([
        ('count_vectorizer', CountVectorizer()),
        ('classifier', LogisticRegression(solver='lbfgs', max_iter=10000)),
    ])

    # Split the raw text; the vectorizer is fitted inside the pipeline on the training part only
    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    print('type of x_test')
    print(type(x_test))
    model = clf.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    print('*************************')
    print('model predictions:')
    print(y_pred)
    summarize_classification(y_test, y_pred, run)

    return model



def main():
    """Script entry point: load the named input dataset, train, upload and register the model.

    The training data is read from the run's 'training_data' named input. The fitted
    pipeline is pickled locally, uploaded to the run as 'outputs/email_classifier.pkl',
    registered as the 'email_classifier' model, and only then is the run completed.
    """
    # Parse the command line so malformed arguments fail early; the data itself is
    # read from run.input_datasets below, not from the parsed value.
    getRuntimeArgs()

    # Get the experiment run context
    run = Run.get_context()

    # NOTE(review): this directory is created but not read anywhere in this script —
    # confirm whether it is still needed.
    dataset_dir = './dataset/'
    os.makedirs(dataset_dir, exist_ok=True)
    ws = run.experiment.workspace
    print(ws)

    print("Loading Data...")
    data = run.input_datasets['training_data'].to_pandas_dataframe()

    print(data.columns)
    lr = model_train(data, run)

    # Save the trained model locally, then attach it to the run under outputs/
    model_file = 'email_classifier.pkl'
    joblib.dump(value=lr, filename=model_file)
    run.upload_file(name='outputs/' + model_file, path_or_stream='./' + model_file)

    # Register the model while the run is still active
    run.register_model(model_path='outputs/' + model_file, model_name='email_classifier',
                       tags={'Training context': 'spam or ham'})

    # Complete the run last, so nothing touches it after it has been marked complete
    run.complete()


if __name__ == "__main__":
    main()

In [None]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.exceptions import ComputeTargetException
# The cluster name is built from a user prefix
user = 'mm'
compute_name = user + "-cluster"
print(compute_name)

# checks to see if compute target already exists in workspace, else create it
try:
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
except ComputeTargetException:
    # Lookup failed: provision a new cluster that can scale between 0 and 1 nodes
    config = AmlCompute.provisioning_configuration(vm_size="STANDARD_D13",
                                                   min_nodes=0, 
                                                   max_nodes=1)

    compute_target = ComputeTarget.create(workspace=ws, name=compute_name, provisioning_configuration=config)
    # Wait for provisioning (40-minute timeout) before continuing
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=40)

## Define Environment

In [None]:
%%writefile $script_folder/experiment_env.yml
# Conda environment specification for the training run
name: experiment_env
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
# Conda packages; scikit-learn, matplotlib and pandas are imported by classifier_training.py.
# NOTE(review): none of these are version-pinned — consider pinning for reproducible runs.
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
# Packages installed through pip rather than conda
- pip:
  - azureml-defaults
  - pyarrow

In [None]:
from azureml.core import Environment

# Create a Python environment for the experiment (from a .yml file)
# First argument is the environment name; second is the conda file written into script_folder above
experiment_env = Environment.from_conda_specification("email classification", script_folder + "/experiment_env.yml")

# Let Azure ML manage dependencies
experiment_env.python.user_managed_dependencies = False 

# Print the environment details
print(experiment_env.name, 'defined.')
print(experiment_env.python.conda_dependencies.serialize_to_string())

In [None]:
import azureml.core.runconfig
from azureml.core import Environment, Experiment
from azureml.core import ScriptRunConfig
from azureml.widgets import RunDetails

# Get the training dataset
email_training_ds = ws.datasets.get('email_dataset')
if email_training_ds is None:
    # .get() returns None for an unknown name; fail here with a clear message
    # instead of an AttributeError on the as_named_input() call below.
    raise ValueError("Dataset 'email_dataset' is not registered in this workspace; "
                     "run the dataset registration cell first.")

# Create a script config
# 'training_data' is the key the training script reads from run.input_datasets.
# compute_target sends the run to the cluster provisioned earlier in this notebook;
# without it the run is submitted to local compute.
script_config = ScriptRunConfig(source_directory=script_folder,
                                script='classifier_training.py',
                                arguments=['--input-data',
                                           email_training_ds.as_named_input('training_data')],  # Reference to dataset
                                compute_target=compute_target,
                                environment=experiment_env)

# submit the experiment
experiment_name = '00_email_classification_model'
experiment = Experiment(workspace=ws, name=experiment_name)
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()