## SETUP

In [9]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
%pip install matplotlib
%pip install mlflow
%pip install --upgrade pip
%pip install dataprep
%pip install --upgrade numpy
%pip install numpy==1.21.0

In [31]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.cluster.hierarchy as sch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score

import mlflow
from math import radians, cos, sin, asin, sqrt
# from dataprep.eda import create_report

# Importação dos dados

In [None]:
sample_url = "water_potability.csv"

sample_data = pd.read_csv(sample_url)

sample_data

## Análise exploratória

In [None]:
# excluindo linhas com valores NaN
df = pd.DataFrame(sample_data)
df.dropna(inplace=True)

df

In [29]:
# Separação de variáveis independentes (X) e dependente (y)
X = df.drop(columns=['Potability'])
y = df['Potability']


In [32]:
# 2. Normalização dos dados
scaler = StandardScaler()
X_normalized = scaler.fit_transform(X)


In [33]:
# 3. Divisão dos dados em treino e teste (80/20)
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)


## TREINAMENTO

In [35]:
# 4. Criando um vetor com os modelos recomendados
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Support Vector Machine': SVC(probability=True),
    'Neural Network': MLPClassifier(max_iter=500)}


In [36]:
# Treinamento e avaliação dos modelos
print("\nTreinamento e avaliação dos modelos:\n")
for model_name, model in models.items():
    # Treinamento
    model.fit(X_train, y_train)
    # Previsões
    y_pred = model.predict(X_test)
    # Avaliação
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Modelo: {model_name}")
    print(f"Acurácia: {accuracy:.2f}")
    print(classification_report(y_test, y_pred))
    print("-" * 50)



Treinamento e avaliação dos modelos:

Modelo: Logistic Regression
Acurácia: 0.57
              precision    recall  f1-score   support

           0       0.57      1.00      0.73       231
           1       0.00      0.00      0.00       172

    accuracy                           0.57       403
   macro avg       0.29      0.50      0.36       403
weighted avg       0.33      0.57      0.42       403

--------------------------------------------------
Modelo: Decision Tree
Acurácia: 0.61
              precision    recall  f1-score   support

           0       0.65      0.71      0.68       231
           1       0.55      0.48      0.51       172

    accuracy                           0.61       403
   macro avg       0.60      0.59      0.59       403
weighted avg       0.60      0.61      0.61       403

--------------------------------------------------
Modelo: Random Forest
Acurácia: 0.68
              precision    recall  f1-score   support

           0       0.67      0.87

## VALIDAÇÃO

In [9]:
#---------------------MÉTRICAS---------------------
silhoutte_metric = metrics.silhouette_score(distance_matrix, rotulo_agglomerative.labels_)
print("silhoutte_metric: ", silhoutte_metric)
calinski_harabasz_metric = metrics.calinski_harabasz_score(distance_matrix, rotulo_agglomerative.labels_)
print("calinski_harabasz_metric: ", calinski_harabasz_metric)
davies_bouldin_metric = metrics.davies_bouldin_score(distance_matrix, rotulo_agglomerative.labels_)
print("davies_bouldin_metric: ", davies_bouldin_metric)

silhoutte_metric:  0.625507078703249
calinski_harabasz_metric:  6237.508216167444
davies_bouldin_metric:  0.36036177368683847


## VERSIONAMENTO

In [10]:
from datetime import datetime

now = datetime.now()

current_time = now.strftime("%d/%m/%Y - %H:%M:%S")
current_time

'21/01/2025 - 21:28:09'

In [41]:
mlflow.set_tracking_uri('http://localhost:5001/')
mlflow.set_experiment(f'Water Quality Prediction')


<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1738013808779, experiment_id='1', last_update_time=1738013808779, lifecycle_stage='active', name='Water Quality Prediction', tags={}>

In [42]:

for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Treinamento
        model.fit(X_train, y_train)
        # Previsões
        y_pred = model.predict(X_test)
        # Avaliação
        accuracy = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)

        # Logging de métricas e modelo no MLflow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_dict(report, "classification_report.json")
        mlflow.sklearn.log_model(model, artifact_path="model")

        print(f"Modelo: {model_name}")
        print(f"Acurácia: {accuracy:.2f}")
        print(classification_report(y_test, y_pred))
        print("-" * 50)




Modelo: Gradient Boosting
Acurácia: 0.65
              precision    recall  f1-score   support

           0       0.64      0.86      0.74       231
           1       0.66      0.35      0.46       172

    accuracy                           0.65       403
   macro avg       0.65      0.61      0.60       403
weighted avg       0.65      0.65      0.62       403

--------------------------------------------------
🏃 View run Gradient Boosting at: http://localhost:5001/#/experiments/1/runs/3f4eac9ea54f4a859fd06df4b19789cf
🧪 View experiment at: http://localhost:5001/#/experiments/1




Modelo: K-Nearest Neighbors
Acurácia: 0.63
              precision    recall  f1-score   support

           0       0.65      0.77      0.70       231
           1       0.58      0.44      0.50       172

    accuracy                           0.63       403
   macro avg       0.61      0.60      0.60       403
weighted avg       0.62      0.63      0.61       403

--------------------------------------------------
🏃 View run K-Nearest Neighbors at: http://localhost:5001/#/experiments/1/runs/ad0dc0af76b04f1496935f8b1e84bb1f
🧪 View experiment at: http://localhost:5001/#/experiments/1




Modelo: Support Vector Machine
Acurácia: 0.67
              precision    recall  f1-score   support

           0       0.66      0.90      0.76       231
           1       0.73      0.37      0.49       172

    accuracy                           0.67       403
   macro avg       0.69      0.63      0.63       403
weighted avg       0.69      0.67      0.64       403

--------------------------------------------------
🏃 View run Support Vector Machine at: http://localhost:5001/#/experiments/1/runs/2270d7c5e6994798892fa9fba6d47470
🧪 View experiment at: http://localhost:5001/#/experiments/1




Modelo: Neural Network
Acurácia: 0.69
              precision    recall  f1-score   support

           0       0.70      0.80      0.75       231
           1       0.67      0.55      0.60       172

    accuracy                           0.69       403
   macro avg       0.68      0.67      0.67       403
weighted avg       0.69      0.69      0.68       403

--------------------------------------------------
🏃 View run Neural Network at: http://localhost:5001/#/experiments/1/runs/19c243c4bd564b5cafb833030ed195a3
🧪 View experiment at: http://localhost:5001/#/experiments/1
