In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('output.csv',  index_col=[0])

In [3]:
df.head()

Unnamed: 0,AQI,Category,Defining Parameter,Number of Sites Reporting,city_ascii,state_name,lat,lng,population,density,Year
0,43,Good,Ozone,1,Flagstaff,Arizona,35.1872,-111.6194,79624.0,428.0,2022
1,42,Good,Ozone,2,Flagstaff,Arizona,35.1872,-111.6194,79624.0,428.0,2022
2,43,Good,Ozone,2,Flagstaff,Arizona,35.1872,-111.6194,79624.0,428.0,2022
3,42,Good,Ozone,2,Flagstaff,Arizona,35.1872,-111.6194,79624.0,428.0,2022
4,43,Good,Ozone,2,Flagstaff,Arizona,35.1872,-111.6194,79624.0,428.0,2022


Encoding categorical data: `Defining Parameter`, `city_ascii` and `state_name`.

In [21]:
df['Category'] = df['Category'].astype('category')
df['Category'] = df['Category'].cat.codes
df['Defining Parameter'] = df['Defining Parameter'].astype('category')
df['Defining Parameter'] = df['Defining Parameter'].cat.codes
df['city_ascii'] = df['city_ascii'].astype('category')
df['city_ascii'] = df['city_ascii'].cat.codes
df['state_name'] = df['state_name'].astype('category')
df['state_name'] = df['state_name'].cat.codes
df.head()

Unnamed: 0,AQI,Category,Defining Parameter,Number of Sites Reporting,city_ascii,state_name,lat,lng,population,density,Year
0,43,0,2,1,25,1,35.1872,-111.6194,79624.0,428.0,2022
1,42,0,2,2,25,1,35.1872,-111.6194,79624.0,428.0,2022
2,43,0,2,2,25,1,35.1872,-111.6194,79624.0,428.0,2022
3,42,0,2,2,25,1,35.1872,-111.6194,79624.0,428.0,2022
4,43,0,2,2,25,1,35.1872,-111.6194,79624.0,428.0,2022


### Separar os dados para treino, teste e validação
Aplicaremos a função `train_test_split` duas vezes para obter os três conjuntos: treino, teste e validação.

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
y = df.Category
X = df.drop('Category', axis=1)
train_ratio = 0.7
test_ratio = 0.20
validation_ratio = 0.10
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_ratio, random_state=1)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=validation_ratio/(train_ratio+test_ratio))

In [7]:
# iniciando o MLFlow
import mlflow

In [None]:
mlflow.set_experiment('air-quality-category')

### Treinamentos

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
import numpy as np

In [9]:
def evaluate_model(dt_classifier):
    print("Test Accuracy :", accuracy_score(y_test, dt_classifier.predict(X_test)))
    print("Test Confusion Matrix:")
    print(confusion_matrix(y_test, dt_classifier.predict(X_test)))

In [13]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

In [10]:
mlflow.start_run(experiment_id='1', run_name='air-quality-category')

<ActiveRun: >

#### K-nearest neighbors

In [24]:
# create and fit knn
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

In [25]:
y_pred_knn = knn.predict(X_test)

In [39]:
(rmse_knn, mae_knn, r2_knn) = eval_metrics(y_test, y_pred_knn)

In [43]:
mlflow.log_metric("accuracy knn", accuracy_score(y_test, y_pred_knn))
mlflow.log_metric("rmse knn", rmse_knn)
mlflow.log_metric("r2 knn", r2_knn)
mlflow.log_metric("mae knn", mae_knn)

In [28]:
# mostrar métricas da predição do knn
evaluate_model(knn)

Test Accuracy : 0.9950477322695924
Test Confusion Matrix:
[[177384      0    209      0      0      0]
 [     0    138      0      1      0      4]
 [   292      0  73679      0    195      0]
 [     0      0      1   6752    139     55]
 [     0      0    305    139  17438      0]
 [     0      1      0     42      0   2492]]


In [44]:
mlflow.sklearn.log_model(knn, 'knn')

ModelInfo(artifact_path='knn', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.10.1', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.1.2', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/eec8a5454e0043ea81cffaf9aca9545b/knn', model_uuid='a2a1a6910294480781300c25a50f4501', run_id='eec8a5454e0043ea81cffaf9aca9545b', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-09-21 13:49:57.075916', mlflow_version='1.28.0')

#### Naive Bayes

In [33]:
# create and fit naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)

In [34]:
y_pred_gnb = gnb.predict(X_test)

In [42]:
(rmse_gnb, mae_gnb, r2_gnb) = eval_metrics(y_test, y_pred_gnb)

In [45]:
mlflow.log_metric("accuracy Naive Bayes", accuracy_score(y_test, y_pred_gnb))
mlflow.log_metric("rmse Naive Bayes", rmse_gnb)
mlflow.log_metric("r2 Naive Bayes", r2_gnb)
mlflow.log_metric("mae Naive Bayes", mae_gnb)

In [46]:
# mostrar métricas da predição do Naive Bayes
evaluate_model(gnb)

Test Accuracy : 0.6399633324500655
Test Confusion Matrix:
[[170661      0   5995      0      0    937]
 [    29     75      5     13      0     21]
 [ 64545      0   7358      0      0   2263]
 [  5511      0    846      0    265    325]
 [ 14384      0   2945      0      0    553]
 [  1568      0    341      0      0    626]]


In [47]:
mlflow.sklearn.log_model(gnb, 'gnb')

ModelInfo(artifact_path='gnb', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.10.1', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.1.2', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/eec8a5454e0043ea81cffaf9aca9545b/gnb', model_uuid='407f44bf65474168a5f5a7c0b4f36787', run_id='eec8a5454e0043ea81cffaf9aca9545b', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-09-21 13:51:10.446412', mlflow_version='1.28.0')

#### Multi layer perceptron

In [23]:
# create and fit MLP with scikit learn
mlp = MLPClassifier(random_state=1, max_iter=300).fit(X_train, y_train)



In [None]:
#  create and fit MLP with PyTorch

In [None]:
# mostrar métricas da predição do multilayer perceptron
evaluate_model(mlp)

In [13]:
mlflow.sklearn.log_model(mlp, 'mlp')

ModelInfo(artifact_path='mlp', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.10.1', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.1.2', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/7682d5206fe04e67b71520f35d8ab680/mlp', model_uuid='91391f6b6788406faea3e78a9d6256d0', run_id='7682d5206fe04e67b71520f35d8ab680', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-09-11 14:45:42.328790', mlflow_version='1.28.0')

#### Random forest

In [48]:
# create and fit Random Forest
random_forest = RandomForestClassifier(max_depth=4, random_state=0)
random_forest.fit(X_train, y_train)

In [49]:
y_pred_randForest = random_forest.predict(X_test)

In [50]:
(rmse_randForest, mae_randForest, r2_randForest) = eval_metrics(y_test, y_pred_randForest)

In [51]:
mlflow.log_metric("accuracy random forest", accuracy_score(y_test, y_pred_randForest))
mlflow.log_metric("rmse random forest", rmse_randForest)
mlflow.log_metric("r2 random forest", r2_randForest)
mlflow.log_metric("mae random forest", mae_randForest)

In [52]:
# mostrar métricas da predição do Random Forest
evaluate_model(random_forest)

Test Accuracy : 0.9043671624902423
Test Confusion Matrix:
[[177593      0      0      0      0      0]
 [     0      0    143      0      0      0]
 [     0      0  74166      0      0      0]
 [     0      0   6947      0      0      0]
 [     0      0  17082      0    800      0]
 [     0      0   2535      0      0      0]]


In [53]:
mlflow.sklearn.log_model(random_forest, 'random forest')

ModelInfo(artifact_path='random forest', flavors={'python_function': {'model_path': 'model.pkl', 'loader_module': 'mlflow.sklearn', 'python_version': '3.10.1', 'env': 'conda.yaml'}, 'sklearn': {'pickled_model': 'model.pkl', 'sklearn_version': '1.1.2', 'serialization_format': 'cloudpickle', 'code': None}}, model_uri='runs:/eec8a5454e0043ea81cffaf9aca9545b/random forest', model_uuid='70f73593f483423c9b64137e3ea08129', run_id='eec8a5454e0043ea81cffaf9aca9545b', saved_input_example_info=None, signature_dict=None, utc_time_created='2022-09-21 13:59:27.978389', mlflow_version='1.28.0')

In [54]:
mlflow.end_run()

### Seleção de hiper-parâmetros

In [29]:
from sklearn.model_selection import GridSearchCV
import numpy as np

#### KNN

In [1]:
params = {
    'metric': ['euclidean', 'minkowski'],
    'n_neighbors': [2, 3, 5, 7, 10]
}

In [71]:
grid_search = GridSearchCV(estimator=knn, 
                           param_grid=params,
                           cv=4, n_jobs=-1, verbose=1, scoring = 'accuracy')

In [72]:
grid_search.fit(X_valid, y_valid)

Fitting 4 folds for each of 5 candidates, totalling 20 fits


In [73]:
best_parms_knn = grid_search.best_params_

#### Gaussian NB

In [58]:
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import GridSearchCV

params_gnb = {'var_smoothing': np.logspace(0,-9, num=100)}
grid_search_gnb = GridSearchCV(estimator=gnb, 
                 param_grid=params_gnb, 
                 cv=4, 
                 verbose=1, 
                 scoring='accuracy') 
Data_transformed = PowerTransformer().fit_transform(X_valid)
grid_search_gnb.fit(Data_transformed, y_valid);

grid_search_gnb.best_estimator_


  x = um.multiply(x, x, out=x)


Fitting 4 folds for each of 100 candidates, totalling 400 fits


In [61]:
best_params_gnb = grid_search_gnb.best_params_

#### Multi-layer perceptron

In [59]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

* Atenção: o scikit learn não foi indicado para uso com MLP, então vamos trocar pelo PyTorch

In [None]:
# dont work
distributions = {'hidden_layer_sizes': (5,2),
                 'activation': ['identity', 'logistic', 'tanh', 'relu'],
                 'solver': ['lbfgs', 'sgd', 'adam']}
random_search_mlp = RandomizedSearchCV(mlp, distributions, random_state=0)
random_search_mlp.fit(X_valid, y_valid)
random_search_best = random_search_mlp.best_estimator_

#### Random Forest

In [65]:
distributions_2 = {
    'max_depth': [1,2,4, 7, 10]
}

In [66]:
random_search_randForest = RandomizedSearchCV(random_forest, distributions_2, random_state=0)
search_randForest = random_search_randForest.fit(X_valid, y_valid)
search_randForest.best_params_



{'max_depth': 10}

### Selecionar melhor modelo e aplicar otimização de acordo com os melhores hiper-parâmetros

In [94]:
#evaluate_model(colocar aqui o melhor modelo de acordo com os melhores hiper paramtros)