# Comet ML Example

In [None]:
# Comet-specific imports
from comet_ml import Experiment, Artifact

In [None]:
# Generic imports
import os
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import pandas as pd 
import numpy as np
from sklearn.utils import shuffle
import seaborn as sns
import matplotlib.pyplot as plt
from pandas import DataFrame
from joblib import dump, load

# Loading dataset
def load_wine_data()-> pd.DataFrame:
    """Build a shuffled, two-class version of the scikit-learn wine dataset.

    The feature columns come from ``load_wine()`` and the target is appended
    as a ``class`` column. Rows whose class is 2 are discarded, the remaining
    rows are shuffled with a fixed seed (42) and the index is renumbered
    from zero.

    Returns:
        A DataFrame with one column per wine feature plus a trailing
        ``class`` column holding only the labels 0 and 1.
    """
    wine = load_wine()
    frame = pd.DataFrame(wine['data'], columns=wine['feature_names'])
    frame['class'] = wine['target']
    # Keep the problem binary: only classes 0 and 1 survive.
    two_class = frame[frame['class'] != 2]
    shuffled = shuffle(two_class, random_state=42)
    # Discard the pre-shuffle row labels instead of keeping them as a column.
    return shuffled.reset_index(drop=True)

In [None]:
df = load_wine_data()
# Every column except the last one ("class") is a feature.
X = df.iloc[:, :-1].to_numpy()
y = df['class'].to_numpy()
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Instantiate Comet Experiment Object

In [None]:
exp = Experiment(
    # API key is read from the environment (KeyError if COMET_API_KEY is unset)
    api_key=os.environ['COMET_API_KEY'],
    # project the experiment is filed under
    project_name='ds_example',
    # code logging is switched on
    log_code=True,
    # 'simple' mode: only record output produced by Python code
    auto_output_logging='simple',
    # GPU metric tracking is switched off
    log_env_gpu=False,
    # CPU metric tracking is switched on
    log_env_cpu=True,
    # collect git metadata
    log_git_metadata=True,
    # log host information
    log_env_host=True,
    # automatically track the CO2 emission of this experiment
    auto_log_co2=True,
    # automatic hyper-parameter logging
    auto_param_logging=True,
)

# EDA

In [None]:
def plot_corr_heatmap(df: DataFrame, exp: Experiment)->None:
    """Draw a triangle correlation heatmap of ``df`` and log it to ``exp``.

    Args:
        df: Frame whose pairwise column correlations (``df.corr()``) are
            plotted.
        exp: Experiment that receives the figure under the name
            ``"corr_heat_map"``.

    Returns:
        None. The figure is created on the global pyplot state and logged.
    """
    # Compute the correlation matrix once and reuse it for mask and plot.
    corr = df.corr()
    plt.figure(figsize=(16, 6))
    # Mask the upper triangle (diagonal included) so each pair shows once.
    # The builtin ``bool`` is used because the ``np.bool`` alias was removed
    # from NumPy (1.24) and raises AttributeError there.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, annot=True, cmap='BuPu')
    heatmap.set_title('Triangle Correlation Heatmap', fontdict={'fontsize':18}, pad=16)
    exp.log_figure("corr_heat_map", plt)
    return None

In [None]:
# Plot the correlation heatmap of the wine frame and log it to the experiment.
plot_corr_heatmap(df, exp)

In [None]:
def plot_scatter(df: DataFrame, exp:Experiment)->None:
    """Scatter alcohol against colour intensity per class and log the figure.

    Args:
        df: Frame providing the ``class``, ``alcohol`` and
            ``color_intensity`` columns.
        exp: Experiment that receives the figure under the name ``"scatter"``.

    Returns:
        None. The figure is created on the global pyplot state and logged.
    """
    plt.figure(figsize=(16, 6))

    # (class value, marker colour, legend label) for each plotted group
    groups = (
        (0, "seagreen", "Class 0"),
        (1, "purple", "Class 1"),
    )
    for class_value, colour, legend_label in groups:
        subset = df.loc[df['class'] == class_value]
        plt.scatter(
            subset["alcohol"],
            subset["color_intensity"],
            color=colour,
            label=legend_label,
            alpha=0.8,
        )

    plt.grid()
    plt.legend(loc="best")
    plt.xlabel("Alcohol")
    plt.ylabel("Color Intensity")
    plt.title("Alcohol vs Color Intensity")

    exp.log_figure("scatter", plt)
    return None

In [None]:
# Plot alcohol vs. colour intensity per class and log the figure to the experiment.
plot_scatter(df, exp)

# Data Prep

In [None]:
scaler = StandardScaler()
# Fit the scaler on the training split only, then apply the same transform to
# the test split. The scaled training data must be bound back to `X_train`
# (not a differently named variable), otherwise the model is fitted on
# unscaled features but asked to predict on scaled ones.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Training & Testing

In [None]:
# Hyper-parameters for the classifier; also logged to the experiment later on.
params = dict(n_estimators=50, max_depth=5)

In [None]:
# Create a random forest classifier from the `params` dict. The seed makes
# the fitted model (and therefore the logged metrics) repeatable, matching the
# seeded shuffle and train/test split used elsewhere in this notebook.
clf = RandomForestClassifier(
    n_estimators=params['n_estimators'],
    max_depth=params['max_depth'],
    random_state=42,
)

# Train on the training split, then predict labels for the held-out test split
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Import the scoring functions by name: the notebook later rebinds the name
# `metrics` to a plain dict, so going through `metrics.<score>` would fail
# with AttributeError whenever this cell is re-run after that point.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Accuracy:",accuracy )
print("Recall:",recall)
print("Precision:",precision)
print("f1:",f1)

# Storing Metrics and Parameters

In [None]:
# Collect the evaluation scores in a dictionary for logging.
# NOTE: this rebinds the name `metrics`, which until here referred to the
# `sklearn.metrics` module imported at the top of the notebook.
metrics = dict(
    f1=f1,
    recall=recall,
    precision=precision,
    accuracy=accuracy,
)

In [None]:
# Record the run's bookkeeping on the experiment: a hash of the training
# data, the hyper-parameter dict and the metric dict.
exp.log_dataset_hash(X_train)
exp.log_parameters(params)
exp.log_metrics(metrics)

# Confusion Matrix

In [None]:
# Ground-truth labels and the model's predictions for the test split.
desired_output, actual_output = y_test, y_pred
# The trailing semicolon keeps the notebook from echoing the call's return value.
exp.log_confusion_matrix(desired_output, actual_output);

# Storing Artifacts

In [None]:
# Make sure the target folder exists first; writing into a missing directory
# would otherwise fail on a fresh checkout.
os.makedirs('artifacts', exist_ok=True)
# Serialize the fitted classifier to disk so it can be attached as an artifact.
dump(clf, 'artifacts/clf.joblib')

In [None]:
# Wrap the serialized classifier file in an Artifact and log it to the
# experiment. The two positional arguments are presumably the artifact's name
# and its type — TODO confirm against the Comet Artifact documentation.
artifact = Artifact("rf_model", "model")
artifact.add('artifacts/clf.joblib')
exp.log_artifact(artifact)

### End Experiment 

In [None]:
# Finish the data-science experiment before the next one is created below.
exp.end()

# ------------------------------------------------------------------------------------------------------------------------------

# DE Example 

In [None]:
import time
from datetime import datetime
from random import randint
from time import sleep

In [None]:
exp = Experiment(
    # API key is read from the environment (KeyError if COMET_API_KEY is unset)
    api_key=os.environ['COMET_API_KEY'],
    # project the experiment is filed under
    project_name='de_example',
    # code logging is switched on
    log_code=True,
    # 'simple' mode: only record output produced by Python code
    auto_output_logging='simple',
    # GPU metric tracking is switched off
    log_env_gpu=False,
    # CPU metric tracking is switched on
    log_env_cpu=True,
    # collect git metadata
    log_git_metadata=True,
    # log host information
    log_env_host=True,
    # automatically track the CO2 emission of this experiment
    auto_log_co2=True,
    # automatic hyper-parameter logging
    auto_param_logging=True,
)

# Transformations - for example only!

In [None]:
def wait(df:DataFrame, min_seconds:int=3, max_seconds:int=6)->DataFrame:
    """Simulate a slow pipeline step: sleep a random while, return ``df`` as is.

    Args:
        df: Frame passed through untouched.
        min_seconds: Lower bound (inclusive) of the random delay in seconds.
        max_seconds: Upper bound (inclusive) of the random delay in seconds.

    Returns:
        The very same ``df`` object that was passed in.

    Raises:
        ValueError: If ``min_seconds`` is greater than ``max_seconds``
            (raised by ``random.randint``).
    """
    # The defaults reproduce the original fixed 3-6 second delay.
    sleep(randint(min_seconds, max_seconds))
    return df

def sample(df:DataFrame, n:int=50, min_seconds:int=1, max_seconds:int=2)->DataFrame:
    """Simulate a sampling step: sleep a random while, then draw ``n`` rows.

    Args:
        df: Frame to draw rows from.
        n: Number of rows to draw (without replacement).
        min_seconds: Lower bound (inclusive) of the random delay in seconds.
        max_seconds: Upper bound (inclusive) of the random delay in seconds.

    Returns:
        A new frame holding ``n`` randomly chosen rows of ``df``.

    Raises:
        ValueError: If ``df`` has fewer than ``n`` rows, or if
            ``min_seconds`` is greater than ``max_seconds``.
    """
    # The defaults reproduce the original behaviour: 1-2 second delay, 50 rows.
    sleep(randint(min_seconds, max_seconds))
    return df.sample(n)

def new_feature(df:DataFrame)->DataFrame:
    """Return a copy of ``df`` with a ``new_feature`` column added.

    The new column duplicates the values of the ``alcohol`` column.

    Args:
        df: Frame containing an ``alcohol`` column.

    Returns:
        A new frame with ``new_feature`` appended; the input frame is left
        unmodified so callers' data is not changed behind their back.

    Raises:
        KeyError: If ``df`` has no ``alcohol`` column.
    """
    # assign() builds a new frame instead of writing into the argument.
    return df.assign(new_feature=df["alcohol"])

In [None]:
# Today's date as an ISO string (YYYY-MM-DD) plus the wall-clock start time.
run_date = datetime.now().date().isoformat()
start = time.time()

# Extract: load the source data and record its size (rows, columns).
df = load_wine_data()
vol_in, feats_in = df.shape

# Transform: run the example steps one after another.
df = (
    df.pipe(wait)
    .pipe(sample)
    .pipe(new_feature)
)

# Load (in this case write to disk): record the output size, then persist.
vol_out, feats_out = df.shape
dump(df, "df_out.joblib")
end = time.time()
execution_time = end - start

In [None]:
# Descriptive parameters of this pipeline run, logged to the experiment below.
params = dict(
    source='database1.table1',
    target='database2.table2',
    run_date=run_date,
    version='v0.1',
)

In [None]:
# Run statistics for the pipeline: row/column counts before and after the
# transformations plus timing information.
# NOTE: this rebinds the name `metrics`, which the top of the notebook
# imported as the `sklearn.metrics` module.
metrics = {
    'volumes_in': vol_in,
    'volumes_out': vol_out,
    'features_in': feats_in,
    'features_out': feats_out,
    'start_time': start,
    'end_time': end,
    # metric name spelled correctly (was 'excution_time')
    'execution_time': execution_time,
}

In [None]:
# Record the run parameters and the run statistics on the experiment.
exp.log_parameters(params)
exp.log_metrics(metrics)

# End Experiment 

In [None]:
# Finish the data-engineering experiment.
exp.end()