In [1]:
import azureml.core
from azureml.core import Workspace
import pandas as pd

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.36.0 to work with mlopsdev


In [2]:
df= pd.read_csv('./Data/titanic.csv')
print(df.shape)
print(df.columns)

(891, 12)
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


In [3]:
df['Age'] = df.groupby(['Pclass', 'Sex'])['Age'].apply(lambda x: x.fillna(x.median()))
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [4]:
df['Loc']= df['Cabin'].apply(lambda x: x[0] if pd.notnull(x) else 'X')

In [5]:
df.drop(['Cabin', 'Ticket'], axis=1, inplace=True)

In [6]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Fare,Embarked,Loc
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,X
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,X
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,C
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,X


In [7]:
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Fare           0
Embarked       2
Loc            0
dtype: int64

In [8]:
df.loc[:,'GroupSize'] = 1 + df['SibSp'] + df['Parch']

In [9]:
df['Embarked'] = df['Embarked'].fillna('S')

In [10]:
LABEL = 'Survived'
columns_to_keep = ['Pclass', 'Sex','Age', 'Fare', 'Embared', 'Deck', 'GroupSize']
columns_to_drop = ['Name','SibSp', 'Parch', 'Survived']
df_train = df
df = df_train.drop(['Name','SibSp', 'Parch', 'PassengerId'], axis=1)

df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Loc,GroupSize
0,0,3,male,22.0,7.25,S,X,2
1,1,1,female,38.0,71.2833,C,C,2
2,1,3,female,26.0,7.925,S,X,1
3,1,1,female,35.0,53.1,S,C,2
4,0,3,male,35.0,8.05,S,X,1


In [11]:
import os
script_folder = os.path.join(os.getcwd(), "train_remote")
print(script_folder)
os.makedirs(script_folder, exist_ok=True)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/devbox/code/Users/babal/AMLBook2021/AMLBook2022/Chapter3/train_remote


In [12]:
df.to_csv('./train_remote/titanic.csv')
df.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare,Embarked,Loc,GroupSize
0,0,3,male,22.0,7.25,S,X,2
1,1,1,female,38.0,71.2833,C,C,2


In [13]:
from azureml.core import Dataset

#use default datastore retrieved from the workspace through the AML SDK
default_ds = ws.get_default_datastore()


default_ds.upload_files(files=['./train_remote/titanic.csv'], # Upload the diabetes csv files in /data
                        target_path= 'Titanic-data', # Put it in a folder path in the datastore
                        overwrite=True, # Replace existing files of the same name
                        show_progress=True)
#Create a tabular dataset from the path on the datastore 
dataset = Dataset.Tabular.from_delimited_files(default_ds.path('Titanic-data/titanic.csv'))

# Register the dataset
try:
    tab_data_set = dataset.register(workspace=ws, 
                                name= 'Titanic-tabular-dataset',
                                description='Tintanic data',
                                tags = {'format':'csv'},
                                create_new_version=True)
    print('Dataset registered.')
except Exception as ex:
        print(ex)

Uploading an estimated of 1 files
Uploading ./train_remote/titanic.csv
Uploaded ./train_remote/titanic.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Dataset registered.


In [14]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

subscription_id = 'c46a9435-c957-4e6c-a0f4-b9a597984773'
resource_group = 'mlops'
workspace_name = 'mlopsdev'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name='Titanic-tabular-dataset')
dataset.to_pandas_dataframe()

Unnamed: 0,Column1,Survived,Pclass,Sex,Age,Fare,Embarked,Loc,GroupSize
0,0,0,3,male,22.0,7.2500,S,X,2
1,1,1,1,female,38.0,71.2833,C,C,2
2,2,1,3,female,26.0,7.9250,S,X,1
3,3,1,1,female,35.0,53.1000,S,C,2
4,4,0,3,male,35.0,8.0500,S,X,1
...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,13.0000,S,X,1
887,887,1,1,female,19.0,30.0000,S,B,1
888,888,0,3,female,21.5,23.4500,S,X,4
889,889,1,1,male,26.0,30.0000,C,C,1


In [15]:
from azureml.core.experiment import Experiment
experiment = Experiment(ws, 'titanic_remote_compute')

In [16]:
import os
script_folder = os.path.join(os.getcwd(), "train")
print(script_folder)
os.makedirs(script_folder, exist_ok=True)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/devbox/code/Users/babal/AMLBook2021/AMLBook2022/Chapter3/train


In [17]:
%%writefile $script_folder/training.py

import os
import sys
import argparse
import joblib
import pandas as pd
import numpy as np

from azureml.core import Run, Dataset, Workspace, Experiment

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score,roc_curve

# Calculate model performance metrics
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

def getRuntimeArgs():
    parser = argparse.ArgumentParser()
    
    parser = argparse.ArgumentParser()
    parser.add_argument("--input-data", type=str)
    args = parser.parse_args()
    return args

def buildpreprocessorpipeline(X_raw):
    categorical_features = X_raw.select_dtypes(include=['object']).columns
    numeric_features = X_raw.select_dtypes(include=['float','int64']).columns

    categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value="missing")),
                                              ('onehotencoder', OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore'))])
    numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('numeric', numeric_transformer, numeric_features),
            ('categorical', categorical_transformer, categorical_features)
        ], remainder="drop")
    
    return preprocessor

def model_train(LABEL, df, run):  
    y_raw = df[LABEL]
    X_raw = df.drop([LABEL], axis=1)
    
     # Train test split
    X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.3, random_state=0)
    
    lg = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
    preprocessor = buildpreprocessorpipeline(X_train)
    
    #estimator instance
    clf = Pipeline(steps=[('preprocessor', preprocessor),
                               ('regressor', lg)])

    model = clf.fit(X_train, y_train)
    
    
    # calculate AUC
    y_scores = model.predict_proba(X_test)
    auc = roc_auc_score(y_test,y_scores[:,1])
    print('AUC: ' + str(auc))
    run.log('AUC', np.float(auc))

    
    # calculate test accuracy
    y_hat = model.predict(X_test)
    acc = np.average(y_hat == y_test)
    print('Accuracy:', acc)
    run.log('Accuracy', np.float(acc))

    # plot ROC curve
    fpr, tpr, thresholds = roc_curve(y_test, y_scores[:,1])
    fig = plt.figure(figsize=(6, 4))
    # Plot the diagonal 50% line
    plt.plot([0, 1], [0, 1], 'k--')
    # Plot the FPR and TPR achieved by our model
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    run.log_image(name = "ROC", plot = fig)
    plt.show()

    # plot confusion matrix
    # Generate confusion matrix
    cmatrix = confusion_matrix(y_test, y_hat)
    cmatrix_json = {
        "schema_type": "confusion_matrix",
           "schema_version": "v1",
           "data": {
               "class_labels": ["0", "1"],
               "matrix": [
                   [int(x) for x in cmatrix[0]],
                   [int(x) for x in cmatrix[1]]
               ]
           }
    }
    
    run.log_confusion_matrix('ConfusionMatrix_Test', cmatrix_json)

    return model, auc, acc
    # Save the trained model
    
    
def main():
    # Create an Azure ML experiment in your workspace
    args = getRuntimeArgs()
    
    run = Run.get_context()

    dataset_dir = './dataset/'
    os.makedirs(dataset_dir, exist_ok=True)
    ws = run.experiment.workspace
    print(ws)
    

    print("Loading Data...")
    dataset = Dataset.get_by_id(ws, id=args.input_data)
    # Load a TabularDataset & save into pandas DataFrame
    df = dataset.to_pandas_dataframe()
    
    print(df.head(5))
 
    model, auc, acc = model_train('Survived', df, run)
    
    os.makedirs('outputs', exist_ok=True)
    
    
    model_file = os.path.join('outputs', 'titanic_model.pkl')
    joblib.dump(value=model, filename=model_file)
    
    run.upload_file(name='titanic_model.pkl', path_or_stream=model_file)
    
    # Register the model
    print('Registering model...')
    run.register_model(model_path='titanic_model.pkl', model_name= 'titanic-model',
                   tags={'Model Type':'Logistic Regresssion'},
                   properties={'AUC': np.float(auc), 'Accuracy': np.float(acc)})

    run.complete()

if __name__ == "__main__":
    main()

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/devbox/code/Users/babal/AMLBook2021/AMLBook2022/Chapter3/train/training.py


In [18]:
%%writefile $script_folder/experiment_env.yml
name: experiment_env
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/devbox/code/Users/babal/AMLBook2021/AMLBook2022/Chapter3/train/experiment_env.yml


In [19]:
from azureml.core import Environment

# Create a Python environment for the experiment (from a .yml file)
experiment_env = Environment.from_conda_specification('experiment-env', script_folder + "/experiment_env.yml")

# Let Azure ML manage dependencies
experiment_env.python.user_managed_dependencies = False 

# Print the environment details
print(experiment_env.name, 'defined.')
print(experiment_env.python.conda_dependencies.serialize_to_string())

experiment-env defined.
name: experiment_env
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow



In [20]:
import azureml.core.runconfig
from azureml.core import Environment, Experiment
from azureml.core import ScriptRunConfig
from azureml.widgets import RunDetails

# Get the training dataset
titanic_ds = ws.datasets.get('Titanic-tabular-dataset')

# Create a script config
script_config = ScriptRunConfig(source_directory=script_folder,
                                script='training.py',
                                arguments=['--input-data', titanic_ds.as_named_input('titanic')], # Reference to dataset
                                environment=experiment_env) 

# submit the experiment
run = experiment.submit(config=script_config)
RunDetails(run).show()
run.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class SynapseCompute: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


{'runId': 'titanic_remote_compute_1643030740_5938d4b4',
 'target': 'local',
 'status': 'Finalizing',
 'startTimeUtc': '2022-01-24T13:25:41.170199Z',
 'services': {},
 'properties': {'_azureml.ComputeTargetType': 'local',
  'ContentSnapshotId': 'b14da3da-8307-42dd-b852-f895db1d5db0',
  'azureml.git.repository_uri': 'https://github.com/balakreshnan/AMLBook2022.git',
  'mlflow.source.git.repoURL': 'https://github.com/balakreshnan/AMLBook2022.git',
  'azureml.git.branch': 'main',
  'mlflow.source.git.branch': 'main',
  'azureml.git.commit': 'fb0eb505a9199147bba52cf3ee1571aad910ebd2',
  'mlflow.source.git.commit': 'fb0eb505a9199147bba52cf3ee1571aad910ebd2',
  'azureml.git.dirty': 'False'},
 'inputDatasets': [{'dataset': {'id': 'cb7282d6-14b7-41f8-b452-4c0343e764c2'}, 'consumptionDetails': {'type': 'RunInput', 'inputName': 'titanic', 'mechanism': 'Direct'}}, {'dataset': {'id': 'cb7282d6-14b7-41f8-b452-4c0343e764c2'}, 'consumptionDetails': {'type': 'Reference'}}],
 'outputDatasets': [],
 'run