In [2]:
# Importar as bibliotecas
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import impyute as impy
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

In [None]:
# Training file: load, clean, impute and scale the training data.
# Columns whose share of missing values exceeds this percentage are discarded.
MAX_MISSING_PERCENT = 50

data_training = pd.read_csv("./air_system_previous_years.csv")

# NOTE(review): this runs before the textual 'na' placeholders become NaN, so it
# only removes columns in which pandas itself already parsed a missing cell.
# It is kept so the training column set does not change relative to the rest
# of the notebook - confirm whether it is actually wanted.
data_training = data_training.dropna(axis=1)

# Encode the label as 1 (pos) / 0 (neg).
data_training['class'] = data_training['class'].map({'pos': 1, 'neg': 0})

features_train = data_training.drop('class', axis=1)
targets_train = data_training['class']

# errors='coerce' turns every non-numeric entry (including the 'na' placeholder)
# into NaN, so no separate replace('na', NaN) pass is required.
for col in features_train.columns:
    try:
        features_train[col] = pd.to_numeric(features_train[col], errors='coerce')
    except (TypeError, ValueError):
        # Only conversion problems are reported; anything else (e.g. an
        # interrupt) is allowed to propagate instead of being swallowed.
        print(f"Could not convert column {col} to numeric.")

# Percentage of missing values per column.
missing_percentage = features_train.isnull().mean() * 100

columns_to_drop = missing_percentage[missing_percentage > MAX_MISSING_PERCENT].index
features_train = features_train.drop(columns=columns_to_drop)

# Mean imputation. A row-wise interpolate() after this step had nothing left to
# fill and would mix values of unrelated samples, so it is not applied.
features_train = features_train.fillna(features_train.mean())

# Keep the labels aligned with the rows of the feature frame.
targets_train = targets_train.loc[features_train.index]

# Fit the scaler on the training features only; later cells reuse `scaler`.
scaler = StandardScaler()
features_train_scaled = scaler.fit_transform(features_train)

In [None]:
# Test file: load the present-year data and put it through the same
# preparation as the training data.
data_current = pd.read_csv("./air_system_present_year.csv")

# Encode the label as 1 (pos) / 0 (neg).
data_current['class'] = data_current['class'].map({'pos': 1, 'neg': 0})

features_current = data_current.drop('class', axis=1)
targets_current = data_current['class']

# errors='coerce' turns every non-numeric entry (including the 'na' placeholder)
# into NaN, so no separate replace('na', NaN) pass is required.
for col in features_current.columns:
    try:
        features_current[col] = pd.to_numeric(features_current[col], errors='coerce')
    except (TypeError, ValueError):
        # Only conversion problems are reported; anything else is allowed to
        # propagate instead of being swallowed.
        print(f"Could not convert column {col} to numeric.")

# Use exactly the columns (and order) the scaler was fitted on. Choosing columns
# from the test set's own missing-value statistics could yield a different
# feature set than the training one and break or silently skew
# scaler.transform / model.predict.
features_current = features_current.reindex(columns=features_train.columns)

# Impute with the training means so that no statistic is learned from the
# evaluation data.
features_current = features_current.fillna(features_train.mean())

# Keep the labels aligned with the rows of the feature frame.
targets_current = targets_current.loc[features_current.index]

# Apply the scaler fitted on the training data (no re-fitting here).
features_current_scaled = scaler.transform(features_current)

In [None]:
# Cost calculation
# Weights applied to each confusion-matrix cell by calculate_cost
# (true negatives carry no cost).
COST_FALSE_POSITIVE = 10
COST_TRUE_POSITIVE = 25
COST_FALSE_NEGATIVE = 500


def calculate_cost(y_true, y_pred):
    """Return the total cost of a set of binary predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels, encoded as 0 (negative) / 1 (positive).
    y_pred : array-like
        Predicted labels, same encoding as ``y_true``.

    Returns
    -------
    Total cost: ``fp * 10 + tp * 25 + fn * 500``.
    """
    # labels=[0, 1] forces a 2x2 matrix even when only one class is present
    # (e.g. a model that never predicts the positive class on a fold whose
    # labels are all negative); without it ravel() would not yield four values.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    total_cost = (
        (fp * COST_FALSE_POSITIVE)
        + (tp * COST_TRUE_POSITIVE)
        + (fn * COST_FALSE_NEGATIVE)
    )
    return total_cost

# greater_is_better=False makes the scorer return the negated cost, so a model
# selection routine that maximises the score ends up minimising the cost.
cost_scorer = make_scorer(calculate_cost, greater_is_better=False)


In [None]:
# Machine learning model: random forest tuned by grid search on the cost scorer.
model = RandomForestClassifier(random_state=42)

# Hyper-parameter grid explored exhaustively by the search below.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 5-fold cross-validation, scored with cost_scorer, using all available cores.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=cost_scorer,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(features_train_scaled, targets_train)

best_model = grid_search.best_estimator_

# Evaluate the selected model on the present-year data.
targets_pred = best_model.predict(features_current_scaled)

# labels=[0, 1] guarantees a 2x2 matrix, so the four-way unpacking below cannot
# fail when the predictions (or the labels) contain a single class only.
cm = confusion_matrix(targets_current, targets_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()
total_cost = calculate_cost(targets_current, targets_pred)

print("Melhores Hiperparâmetros:", grid_search.best_params_)
print("Confusion Matrix:\n", cm)
print("Custo Total:", total_cost)
print("Classification Report:\n", classification_report(targets_current, targets_pred))


In [None]:
# Compute the accuracy on the test set from the confusion-matrix counts.
correct_predictions = tp + tn
total_predictions = tp + tn + fp + fn
accuracy_test = correct_predictions / total_predictions
print("Acurácia do teste:", accuracy_test)