In [1]:
import seaborn as sns
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings

%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Import seaborn
import seaborn as sns

# Apply the default theme
sns.set_theme()

# Load an example dataset
df = sns.load_dataset("penguins")

In [3]:
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,,,,,
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [4]:
df = df.dropna()


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 333 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            333 non-null    object 
 1   island             333 non-null    object 
 2   bill_length_mm     333 non-null    float64
 3   bill_depth_mm      333 non-null    float64
 4   flipper_length_mm  333 non-null    float64
 5   body_mass_g        333 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 20.8+ KB


In [6]:
categorical = df.select_dtypes(["category", "object"]).columns
for cat_col in categorical:
    print(f"{cat_col} : {df[cat_col].nunique()} uniqueness variable(s)")

species : 3 uniqueness variable(s)
island : 3 uniqueness variable(s)
sex : 2 uniqueness variable(s)


In [7]:
numeric = df.select_dtypes(["int", "float"]).columns
for num_col in numeric:
    print(f"{num_col} : {df[num_col].nunique()} uniqueness variable(s)")

bill_length_mm : 163 uniqueness variable(s)
bill_depth_mm : 79 uniqueness variable(s)
flipper_length_mm : 54 uniqueness variable(s)
body_mass_g : 93 uniqueness variable(s)


In [8]:
missing_values_count = df.isnull().sum()

missing_values_count[:7]

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

In [9]:
# let's encode the data
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder() # label encoder

for i in df.columns:
    if df[i].dtype == "object":
        label_encoder.fit_transform(list(df[i].values))
        df[i] = label_encoder.transform(df[i].values)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[i] = label_encoder.transform(df[i].values)


In [10]:
# get the correlation
df_corr = df.corr()
df_corr

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
species,1.0,-0.622428,0.730548,-0.740346,0.850737,0.750434,0.010964
island,-0.622428,1.0,-0.337179,0.568031,-0.554413,-0.559526,-0.012435
bill_length_mm,0.730548,-0.337179,1.0,-0.228626,0.653096,0.589451,0.344078
bill_depth_mm,-0.740346,0.568031,-0.228626,1.0,-0.577792,-0.472016,0.372673
flipper_length_mm,0.850737,-0.554413,0.653096,-0.577792,1.0,0.872979,0.255169
body_mass_g,0.750434,-0.559526,0.589451,-0.472016,0.872979,1.0,0.424987
sex,0.010964,-0.012435,0.344078,0.372673,0.255169,0.424987,1.0


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder




X = df.iloc[:, :-1]
y = df['species']


le = LabelEncoder()
le.fit(y)
y = le.transform(y)

X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.3)


X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)


In [12]:
import mlflow

In [13]:
DB_URI = 'sqlite:///mlrunsdb.db'
mlflow.set_tracking_uri(DB_URI)
tags = {
    "Módulo":"Modelos Produtivos 4",
    "Turma":815,
    "objeto":'pinguins'
}
mlflow.set_experiment(experiment_name='Classificação de especies de pinguins')
mlflow.set_experiment_tags(tags=tags)

In [14]:
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix)


def get_metrics(y_test:list, y_pred:list) -> list:
    ac = accuracy_score(y_test, y_pred)
    pr = precision_score(y_test, y_pred, average='weighted')
    rc = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    return [ac, pr, rc, f1]

In [15]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

with mlflow.start_run(
    run_name='Tentativa 1',
    description='Classificando pinguins com knn',
    tags={"version":"v1"}
)as model_run:
    k = 7
    knn = KNeighborsClassifier(n_neighbors=k)

    model = Pipeline([
        ('scaler',StandardScaler()),
        ('knn',knn)
    ])

    model.fit(X_train,y_train)
    y_pred = model.predict(X_test)
    # mc = confusion_matrix(y_test.values.ravel(), y_pred)
    ac, pr, rc, f1 = get_metrics(y_test, y_pred)

    # mlflow.log_param("k",k)
    # mlflow.log_param("size_train_dataset", len(X_train))
    # mlflow.log_param("size_test_dataset", len(X_test))
    params = {
        "k":k,
        "size_train_dataset": len(X_train),
        "size_test_dataset": len(X_test),
    }

    metrics = {
        "acuracia":ac,
        "precision":pr,
        "recall":rc,
        "f1":f1
    }

    mlflow.log_params(params=params)
    mlflow.log_metrics(metrics=metrics)

    mlflow.sklearn.log_model(model, "model")



In [46]:
import mlflow 

mlflow.set_tracking_uri(uri='http://localhost:5000/')

PATH = 'models:/penguins/Production'

loaded_model = mlflow.sklearn.load_model(PATH)

loaded_model.predict(X_val)

RestException: RESOURCE_DOES_NOT_EXIST: Registered Model with name=penguins not found