In [1]:
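#  Usage when run as a script (arguments: alpha, then l1_ratio):
#    python train_diabetes.py 0.01 0.01
#    python train_diabetes.py 0.01 0.75
#    python train_diabetes.py 0.01 1.0
#  NOTE: in this notebook command-line arguments are not read; alpha and
#  l1_ratio are assigned directly in the training cell.
#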

import os
import warnings
import sys

import pandas as pd
import numpy as np
from itertools import cycle
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import lasso_path, enet_path
from sklearn import datasets

import mlflow.sklearn
from  mlflow.tracking import MlflowClient
# Newsgroup-style category names.
# NOTE(review): none of the visible cells read this list; it looks like a
# leftover from a text-classification demo. Confirm before removing.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Two separate, initially empty dicts; no visible cell populates them.
mlflow_params, mlflow_metrics = {}, {}

In [2]:
# Point both the low-level client and the fluent mlflow API at the tracking
# server named by the TRACKING_URL environment variable.
# NOTE(review): os.environ.get returns None when TRACKING_URL is unset; MLflow
# is then presumably left on its default tracking location. Confirm that this
# is the intended fallback.
tracking_uri = os.environ.get("TRACKING_URL")
client = MlflowClient(tracking_uri=tracking_uri)
mlflow.set_tracking_uri(tracking_uri)

# MlflowClient.list_experiments() was removed in MLflow 2.x in favour of
# search_experiments(); use the new method when the installed client has it
# and fall back to the old one otherwise.
_fetch_experiments = getattr(client, "search_experiments", None) or client.list_experiments
experiments = _fetch_experiments()
experiment_names = [exp.name for exp in experiments]

# NOTE(review): the experiment is called "nlp_demo" although this notebook
# trains a diabetes regression model; the name is kept so runs keep landing
# in the same experiment.
experiment_name = "nlp_demo"
if experiment_name not in experiment_names:
    mlflow.create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)



In [3]:
# Fetch scikit-learn's bundled diabetes dataset and unpack the feature matrix
# (X) and the regression target (y).
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# NOTE(review): this prints the entire dataset object, including the full
# data and target arrays; consider printing only shapes / feature names.
print(diabetes)

{'data': array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990842, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06832974, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286377, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04687948,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452837, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00421986,  0.00306441]]), 'target': array([151.,  75., 141., 206., 135.,  97., 138.,  63., 110., 310., 101.,
        69., 179., 185., 118., 171., 166., 144.,  97., 168.,  68.,  49.,
        68., 245., 184., 202., 137.,  85., 131., 283., 129.,  59., 341.,
        87.,  65., 102., 265., 276., 252.,  90., 100.,  55.,  61.,  92.,
       259.,  53., 190., 142.,  75., 142., 155., 225.,  59., 104., 182.,
  

In [4]:
# Assemble one pandas DataFrame holding the features plus the target, with
# the target stored in the last column under the name "progression".
Y = np.array(y, ndmin=2).T          # target copied into a single column
d = np.hstack((X, Y))               # features followed by the target column
cols = [*diabetes.feature_names, "progression"]
data = pd.DataFrame(data=d, columns=cols)


In [5]:
# Regression quality metrics
def eval_metrics(actual, pred):
    """Return ``(rmse, mae, r2)`` for predictions ``pred`` against ``actual``.

    RMSE is the square root of ``mean_squared_error``; MAE and R2 are the
    results of ``mean_absolute_error`` and ``r2_score`` respectively.
    """
    mse = mean_squared_error(actual, pred)
    return np.sqrt(mse), mean_absolute_error(actual, pred), r2_score(actual, pred)

In [8]:
if __name__ == "__main__":
    # Training cell: split the diabetes frame, write summary statistics, fit an
    # ElasticNet model, log params/metrics/model to MLflow, then compute, save
    # and log a plot of the elastic-net regularization path.
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "progression" which is a quantitative measure of
    # disease progression one year after baseline
    train_x = train.drop(["progression"], axis=1)
    test_x = test.drop(["progression"], axis=1)
    train_y = train[["progression"]]
    test_y = test[["progression"]]

    std = data.std()
    print(std)

    # Per-column summary statistics.
    # NOTE(review): "bp" is absent from this column list (unchanged from the
    # original); confirm whether leaving it out is intentional.
    stat_funcs = ["min", "max", "median", "mean", "std"]
    stat_columns = ["age", "sex", "bmi", "s1", "s2", "s3", "s4", "s5", "s6"]
    stat = data.agg({column: stat_funcs for column in stat_columns})

    # The context manager closes the file even if the write raises.
    with open('statistics.txt', 'w') as stat_file:
        stat_file.write(str(stat))

    print(stat)

    # Hyper-parameters are assigned directly; command-line arguments are not
    # parsed in this notebook.
    alpha = 0.01
    l1_ratio = 0.01

    # Run ElasticNet
    lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
    lr.fit(train_x, train_y)
    predicted_qualities = lr.predict(test_x)
    (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

    # Print out ElasticNet model metrics
    print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    # Log inside an explicit run so the run is ended when this block exits
    # (including on error). Without it the implicitly started run stays active
    # in the kernel, and re-executing the cell with a different alpha/l1_ratio
    # would try to overwrite params of that same run.
    with mlflow.start_run():
        # Log mlflow attributes for mlflow UI
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mae", mae)
        mlflow.sklearn.log_model(lr, "model")

        # Compute paths
        eps = 5e-3  # the smaller it is the longer is the path

        print("Computing regularization path using the elastic net.")
        # The path is computed on the full dataset (X, y), not the train split.
        # NOTE(review): newer scikit-learn releases may reject the
        # fit_intercept keyword here; verify against the installed version.
        alphas_enet, coefs_enet, _ = enet_path(X, y, eps=eps, l1_ratio=l1_ratio, fit_intercept=False)

        # Display results
        fig = plt.figure(1)
        ax = plt.gca()

        # One dashed line per coefficient row, cycling through five colours.
        colors = cycle(["b", "r", "g", "c", "k"])
        neg_log_alphas_enet = -np.log10(alphas_enet)
        for coef_e, c in zip(coefs_enet, colors):
            ax.plot(neg_log_alphas_enet, coef_e, linestyle="--", c=c)

        ax.set_xlabel("-Log(alpha)")
        ax.set_ylabel("coefficients")
        title = "ElasticNet Path by alpha for l1_ratio = " + str(l1_ratio)
        ax.set_title(title)
        ax.axis("tight")

        # Save figures
        fig.savefig("ElasticNet-paths.png")

        # Close plot
        plt.close(fig)

        # Log artifacts (output files)
        mlflow.log_artifact("ElasticNet-paths.png")

age             0.047619
sex             0.047619
bmi             0.047619
bp              0.047619
s1              0.047619
s2              0.047619
s3              0.047619
s4              0.047619
s5              0.047619
s6              0.047619
progression    77.093005
dtype: float64
                 age           sex           bmi            s1            s2  \
min    -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.267807e-01 -1.156131e-01   
max     1.107267e-01  5.068012e-02  1.705552e-01  1.539137e-01  1.987880e-01   
median  5.383060e-03 -4.464164e-02 -7.283766e-03 -4.320866e-03 -3.819065e-03   
mean   -3.639623e-16  1.309912e-16 -8.013951e-16 -9.042540e-17  1.301121e-16   
std     4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02  4.761905e-02   

                  s3            s4            s5            s6  
min    -1.023071e-01 -7.639450e-02 -1.260974e-01 -1.377672e-01  
max     1.811791e-01  1.852344e-01  1.335990e-01  1.356118e-01  
median -6.584468e-03 -2.592262e-03