In [2]:
import seaborn as sns
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [3]:
df=sns.load_dataset('penguins').dropna()
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [4]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder() # label encoder

for i in df.columns:
    if df[i].dtype == "object":
        label_encoder.fit_transform(list(df[i].values))
        df[i] = label_encoder.transform(df[i].values)

In [14]:
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,1
1,0,2,39.5,17.4,186.0,3800.0,0
2,0,2,40.3,18.0,195.0,3250.0,0
4,0,2,36.7,19.3,193.0,3450.0,0
5,0,2,39.3,20.6,190.0,3650.0,1
...,...,...,...,...,...,...,...
338,2,0,47.2,13.7,214.0,4925.0,0
340,2,0,46.8,14.3,215.0,4850.0,0
341,2,0,50.4,15.7,222.0,5750.0,1
342,2,0,45.2,14.8,212.0,5200.0,0


In [5]:
X_features = df.drop("species", axis=1)
y_labels = df["species"]

X_features.shape, y_labels.shape

((333, 6), (333,))

In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV

X_train, X_test, y_train, y_test = train_test_split(
    X_features, y_labels, random_state=42, 
    test_size=.25, stratify=y_labels
)

X_train, X_val, y_train, y_val = train_test_split(
    X_features, y_labels, random_state=42, 
    test_size=.25, stratify=y_labels
)

print(f"X_train shape: {X_train.shape} \n"
       f"X_test shape: {X_test.shape} \n"
       f"y_train shape: {y_train.shape} \n"
       f"y_test.shape: {y_test.shape}")

X_train shape: (249, 6) 
X_test shape: (84, 6) 
y_train shape: (249,) 
y_test.shape: (84,)


In [7]:
import mlflow

In [8]:
DB_URI = 'sqlite:///mlrunsdb.db'
mlflow.set_tracking_uri(DB_URI)
tags = {
    "Módulo":"Modelos Produtivos 4 - GridSearchCV",
    "Turma":815,
    "objeto":'pinguins'
}
mlflow.set_experiment(experiment_name='Classificação de especies de pinguins com GridSearchCV')
mlflow.set_experiment_tags(tags=tags)

In [9]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)


def get_metrics(y_test:list, y_pred:list) -> list:
    ac = accuracy_score(y_test, y_pred)
    pr = precision_score(y_test, y_pred, average='weighted')
    rc = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    return [ac, pr, rc, f1]

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Parameter
parameter = {
    "max_depth": [2, 4, 6],
    "max_features": ["auto", "sqrt", "log2"],
    "criterion": ["gini", "entropy", "log_loss"],
    "splitter": ["best", "random"]
}

# models
model = GridSearchCV(DecisionTreeClassifier(random_state=42), 
                     param_grid=parameter, cv=3, n_jobs=-1, verbose=1)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
    # mc = confusion_matrix(y_test.values.ravel(), y_pred)
ac, pr, rc, f1 = get_metrics(y_test, y_pred)

# mlflow.log_param("k",k)
# mlflow.log_param("size_train_dataset", len(X_train))
# mlflow.log_param("size_test_dataset", len(X_test))
params = {
    "size_train_dataset": len(X_train),
    "size_test_dataset": len(X_test),
}

metrics = {
    "acuracia":ac,
    "precision":pr,
    "recall":rc,
    "f1":f1
}

mlflow.log_params(params=params)
mlflow.log_metrics(metrics=metrics)

mlflow.sklearn.log_model(model, "model")

print(f"model train score: {model.score(X_train, y_train)},\n" 
      f"model test score: {model.score(X_test, y_test)},\n" 
      f"model best score: {model.best_score_}")

Fitting 3 folds for each of 54 candidates, totalling 162 fits




model train score: 0.9116465863453815,
model test score: 0.8333333333333334,
model best score: 0.8795180722891566


In [11]:
import mlflow 

mlflow.set_tracking_uri(uri='http://localhost:5000/')

PATH = 'models:/penguins/Production'

loaded_model = mlflow.sklearn.load_model(PATH)
loaded_model.predict(X_val)

Feature names unseen at fit time:
- sex
Feature names seen at fit time, yet now missing:
- species



array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [13]:
loaded_model.score(X_val,y_val)

Feature names unseen at fit time:
- sex
Feature names seen at fit time, yet now missing:
- species



0.35714285714285715