<a href="https://colab.research.google.com/github/anjha1/Azure/blob/main/ML/Universal%20Azure%20ML%20Dataset%20Handling%20Code%20(Training%20Script).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# banknotes_training.py

import os
import argparse
import pandas as pd
import numpy as np
import joblib
import glob
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from azureml.core import Run

# Argument parsing
# Every flag is optional; the defaults below apply when a flag is omitted.
parser = argparse.ArgumentParser()
parser.add_argument('--regularization', type=float, dest='reg_rate', default=0.01, help='Regularization rate')
parser.add_argument('--dataset-type', type=str, dest='ds_type', default='tabular', help='tabular or file')
parser.add_argument('--dataset-name', type=str, dest='ds_name', default='banknotes', help='Dataset input name')
args = parser.parse_args()

reg = args.reg_rate
ds_type = args.ds_type
ds_name = args.ds_name

# The model is built with C=1/reg further down, so a zero rate would raise
# ZeroDivisionError and a negative/NaN rate would be rejected by scikit-learn,
# both only after the data has been loaded. Fail fast with a usage error instead.
# `not reg > 0` (rather than `reg <= 0`) also catches NaN.
if not reg > 0:
    parser.error(f"--regularization must be a positive number, got {reg}")

# Handle to the current Azure ML run; used below for input datasets and metric logging.
run = Run.get_context()
print("Dataset type:", ds_type)

# Load data
# The dataset is looked up by its input name on the run; how it is turned into
# a DataFrame depends on --dataset-type.
if ds_type == 'tabular':
    print("Loading tabular dataset...")
    banknotes = run.input_datasets[ds_name].to_pandas_dataframe()

elif ds_type == 'file':
    print("Loading file dataset...")
    data_path = run.input_datasets[ds_name]
    if hasattr(data_path, 'download'):
        # A dataset object: download() hands back the paths of the downloaded
        # files (not a folder), so pick the CSV files out of that list directly.
        downloaded_files = data_path.download()
        csv_files = [str(p) for p in downloaded_files if str(p).endswith('.csv')]
    else:
        # Otherwise the input is treated as a folder path (e.g. a mount point).
        csv_files = glob.glob(os.path.join(str(data_path), '*.csv'))

    # glob/download order is not guaranteed; sort so the concatenated row order
    # (and therefore the seeded train/test split) is reproducible.
    csv_files = sorted(csv_files)
    if not csv_files:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"No CSV files found for file dataset '{ds_name}'.")

    # ignore_index avoids duplicated row labels when several files are stacked.
    banknotes = pd.concat((pd.read_csv(f) for f in csv_files), ignore_index=True)

else:
    raise ValueError("Unsupported dataset type. Use 'tabular' or 'file'.")

# Split data
# Four feature columns are used as inputs and 'fraudulent' as the label;
# 30% of the rows are held out for testing with a fixed seed.
feature_columns = ['var', 'skew', 'kurtosis', 'entropy']
X = banknotes[feature_columns].values
y = banknotes['fraudulent'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Train model
# LogisticRegression is given C as the reciprocal of the regularization rate.
print("Training model...")
model = LogisticRegression(C=1/reg, solver='liblinear')
model.fit(X_train, y_train)
run.log('Regularization Rate', reg)

# Evaluate
# Accuracy is the fraction of test rows predicted correctly; AUC is computed
# from the second column of predict_proba.
y_pred = model.predict(X_test)
accuracy = np.mean(y_pred == y_test)
positive_scores = model.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, positive_scores)
for metric_name, metric_value in (('Accuracy', accuracy), ('AUC', auc)):
    run.log(metric_name, metric_value)

print(f'Accuracy: {accuracy:.3f}, AUC: {auc:.3f}')

# Save model
# Persist the fitted model under ./outputs, then mark the run as finished.
os.makedirs('outputs', exist_ok=True)
joblib.dump(model, 'outputs/banknotes_model.pkl')
run.complete()


🧪 How to Use the Training Script with an SKLearn Estimator

In [None]:
# Expose the dataset to the training script under the input name 'banknotes';
# this must match the value passed as --dataset-name below.
training_input = dataset.as_named_input('banknotes')

# Command-line arguments forwarded to banknotes_training.py.
script_params = {
    '--regularization': 0.1,
    '--dataset-type': 'tabular',  # or 'file'
    '--dataset-name': 'banknotes',
}

# Estimator that runs the entry script from the experiment folder on the
# 'local' compute target, with azureml-dataprep[pandas] added as a pip package.
estimator = SKLearn(
    source_directory=experiment_folder,
    entry_script='banknotes_training.py',
    script_params=script_params,
    compute_target='local',
    inputs=[training_input],
    pip_packages=['azureml-dataprep[pandas]'],
)
