## MLOps tools: MLflow

MLflow is an open-source platform for managing the end-to-end machine learning lifecycle. It was developed by Databricks and is now maintained by the Linux Foundation. MLflow provides a set of tools for tracking experiments, packaging code into reproducible runs, and sharing and deploying models.

MLflow is designed to help data scientists and machine learning engineers manage the complexity of the machine learning lifecycle. With MLflow, you can:

- **Track experiments:** MLflow allows you to log parameters, metrics, and artifacts for each experiment, making it easy to reproduce results and compare different models.

- **Package code:** MLflow allows you to package code into reproducible runs, making it easy to share and deploy models across different environments.

- **Share and deploy models:** MLflow provides tools for sharing models with other members of your team, as well as for deploying models to different production environments.

MLflow is library-agnostic and integrates with popular machine learning libraries like scikit-learn, TensorFlow, and PyTorch, so you can use your favorite tools and workflows with MLflow. It also integrates with a variety of data storage and compute platforms, including Amazon S3, Azure Blob Storage, and Databricks.

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Load the hourly load time series from disk (the data is read from a CSV,
# not generated).
df = pd.read_csv("data/LoadTimeSeriesData_case1.csv", parse_dates=['timestamp'])

# Fill gaps in the power signal with a cubic spline. The result is assigned
# back rather than calling `inplace=True` on `df['power']`: that chained
# in-place call raises a FutureWarning on pandas 2.x and leaves `df`
# unmodified when Copy-on-Write is active.
df['power'] = df['power'].interpolate(method='spline', order=3)

# Reshape to one row per date and one column per hour, so that every day's
# load profile is a single sample for the clustering.
df['date'] = df['timestamp'].dt.date
df['hour'] = df['timestamp'].dt.hour
df_matrix = df.pivot(index='date', columns='hour', values='power')

# Set up the mlflow experiment
mlflow.set_experiment("KMeans Clustering")

# Define the clustering parameters
k = 5
# Fixed seed so the centroid initialisation, and therefore the logged
# metrics, can be reproduced from the logged parameters.
seed = 42

# Train the k-means model; everything logged below is attached to this run.
with mlflow.start_run(run_name="kmeans_model"):

    # Fit the model
    kmeans = KMeans(n_clusters=k, random_state=seed)
    kmeans.fit(df_matrix)

    # Evaluate the model using the Davies-Bouldin index and Silhouette index
    db_index = davies_bouldin_score(df_matrix, kmeans.labels_)
    sil_score = silhouette_score(df_matrix, kmeans.labels_)

    # Log the metrics to mlflow
    mlflow.log_metric("Davies-Bouldin Index", db_index)
    mlflow.log_metric("Silhouette Score", sil_score)

    # Log the hyperparameters to mlflow
    mlflow.log_param("k", k)
    mlflow.log_param("random_state", seed)

    # Log the model to mlflow
    mlflow.sklearn.log_model(kmeans, "model")


In [None]:
import mlflow
import mlflow.sklearn
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score, silhouette_score
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import calmap
from matplotlib.colors import ListedColormap, Normalize

# Load the hourly load time series from disk (the data is read from a CSV,
# not generated).
df = pd.read_csv("data/LoadTimeSeriesData_case1.csv", parse_dates=['timestamp'])

# Fill gaps in the power signal with a cubic spline. The result is assigned
# back rather than calling `inplace=True` on `df['power']`: that chained
# in-place call raises a FutureWarning on pandas 2.x and leaves `df`
# unmodified when Copy-on-Write is active.
df['power'] = df['power'].interpolate(method='spline', order=3)

# Reshape to one row per date and one column per hour, so that every day's
# load profile is a single sample for the clustering.
df['date'] = df['timestamp'].dt.date
df['hour'] = df['timestamp'].dt.hour
df_matrix = df.pivot(index='date', columns='hour', values='power')

# Set up the mlflow experiment
mlflow.set_experiment("KMeans Clustering")

# Define the clustering parameters
k = 4
# Fixed seed so the centroid initialisation, and therefore the logged
# metrics and figures, can be reproduced from the logged parameters.
seed = 42

# Train the k-means model; everything logged below is attached to this run.
with mlflow.start_run(run_name="kmeans_model"):

    # Fit the model
    kmeans = KMeans(n_clusters=k, random_state=seed)
    kmeans.fit(df_matrix)
    labels = kmeans.labels_
    # Tuple of (sorted cluster ids, number of days in each cluster).
    cluster_counts = np.unique(labels, return_counts=True)
    days_per_cluster = dict(zip(*cluster_counts))

    # Attach each day's cluster label to its hourly rows by looking the date
    # up, instead of repeating every label 24 times: the lookup does not
    # depend on `df` being sorted by date or on each day having exactly
    # 24 rows.
    date_to_cluster = pd.Series(labels, index=df_matrix.index)
    df['cluster'] = df['date'].map(date_to_cluster)
    # Mean hourly profile of each cluster, in long format.
    centroids = df.groupby(['cluster', 'hour'])['power'].mean().reset_index()

    # Evaluate the model using the Davies-Bouldin index and Silhouette index
    db_index = davies_bouldin_score(df_matrix, labels)
    sil_score = silhouette_score(df_matrix, labels)

    # Log the metrics to mlflow
    mlflow.log_metric("Davies-Bouldin Index", db_index)
    mlflow.log_metric("Silhouette Score", sil_score)

    # Log the hyperparameters to mlflow
    mlflow.log_param("k", k)
    mlflow.log_param("random_state", seed)

    # Log the model to mlflow
    mlflow.sklearn.log_model(kmeans, "model")

    # Load profiles: one facet per cluster, one gray line per day.
    g = sns.FacetGrid(data=df, col='cluster', hue='date', col_wrap=3, height=3, aspect=2, sharey=False)
    g.map(sns.lineplot, 'hour', 'power', color='gray')

    # Overlay each cluster's mean profile in red and annotate the facet with
    # the number of days assigned to that cluster. The shared upper y-limit is
    # computed once because it does not change between facets.
    power_max = df['power'].max()
    for ax, cluster in zip(g.axes.flatten(), centroids['cluster'].unique()):
        sns.lineplot(x='hour', y='power', data=centroids[centroids['cluster'] == cluster], color='r', ax=ax, label='Profilo medio', legend=False)
        ax.set_ylim(bottom=0, top=power_max)
        ax.set_xticks(range(0, 24))
        ax.grid(True, linestyle='--')

        cluster_counts_str = 'Count: ' + str(days_per_cluster[cluster])
        ax.text(0.05, 0.95, cluster_counts_str, transform=ax.transAxes, fontsize=10, verticalalignment='top')

    # Log the figure first, then close it so it is released only after it has
    # been saved. NOTE: the artifact name keeps its existing spelling
    # ("centroinds") so it stays consistent with previously logged runs.
    mlflow.log_figure(g.fig, 'graph/centroinds.png')
    plt.close(g.fig)

    # Calendar visualization: one cell per day, colored by cluster label.
    cal_data = pd.DataFrame({'cluster': labels}, index=pd.to_datetime(df_matrix.index))
    colors = sns.color_palette("husl", k)
    hex_colors = ['#{:02x}{:02x}{:02x}'.format(int(color[0]*255), int(color[1]*255), int(color[2]*255)) for color in colors]
    cmap = ListedColormap(hex_colors)

    fig, ax = calmap.calendarplot(cal_data['cluster'], cmap=cmap, 
                                fillcolor='grey', linewidth=0.5, fig_kws=dict(figsize=(12, 4)), monthticks=3, daylabels='MTWTFSS')

    # Horizontal colorbar above the calendar acting as the cluster legend;
    # the empty array only satisfies the ScalarMappable API.
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=Normalize(vmin=0, vmax=k-1))
    sm.set_array([])
    cax = fig.add_axes([0.3, 0.9, 0.4, 0.05])
    cb = plt.colorbar(sm, cax=cax, orientation='horizontal')
    cb.set_label('Cluster')
    cb.set_ticks(cal_data['cluster'].unique())
    cb.set_ticklabels(cal_data['cluster'].unique())

    mlflow.log_figure(fig, 'graph/calendar_map.png')


# plt.show()