In [51]:
import numpy as np
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from unidecode import unidecode
response ="COVID-19 - PESQUISA DE ANTICORPOS IgG COVID-19 IgG"
covidRelated1 = "TESTE RÁPIDO PARA COVID-19 DE PONTA DE DEDO TESTE RÁPIDO PARA COVID-19 DE PONTA DE DEDO"
covidRelated2 = "COVID TESTE LÍQUOR COVID TESTE LÍQUOR"
covidRelated3 = "Teste Rápido para SARS-CoV-2- Pesquisa de anticorpos IgG e IgM (Sorologia para COVID-19) Teste Rápido para SARS-CoV-2- Pesquisa de anticor"

In [44]:
pacientes = pd.read_csv(r'../Data/HC_PACIENTES_1.csv', sep='|')

df = pd.read_csv(r'../Data/HC_EXAMES_1.csv', sep='|')
df = pd.merge(df, pacientes)

df.insert(1, "EXAME", df['DE_EXAME']+" "+df['DE_ANALITO'])

out = df.pivot_table(index=['ID_aTENDIMENTO', 'IC_SEXO', 'AA_NASCIMENTO'], columns='EXAME', values='DE_RESULTADO', aggfunc='first')
out = out.reset_index(level=['IC_SEXO', 'AA_NASCIMENTO'])

In [45]:
le = LabelEncoder()

out[response].mask(out[response].isna() & (out[covidRelated1].notna() | out[covidRelated2].notna() | out[covidRelated3].notna()), "Reagente", inplace=True)

#tirar colunas sem resposta 
out.dropna(subset=[response], inplace=True)
#0 negativo,1 positivo, removemos classe indeterminado
out = out[out[response].isin(["Não reagente", "Reagente"])]

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  out[response].mask(out[response].isna() & (out[covidRelated1].notna() | out[covidRelated2].notna() | out[covidRelated3].notna()), "Reagente", inplace=True)


In [46]:
def try_float(x):
    try:
        return float(x)
    except (ValueError, TypeError):
        return np.nan

mapping = {}
positive_words = ["pos", "numeroso", "positivo", "grande", "detectado", "reagente", "sim", 
                  "presente", "presenca", "maximo", "observado", "s", "numerosas", "normais", 
                  "intenso", "intensa", "positivos", "positivas", "reagentes", "+"]

negative_words = ["neg", "nao", "raro", "rara", "isolados", "inadequada", "zero", 
                 "invalido", "invalido", "ausencia", "ausente", "ausencia", "failed", "indetectavel",
                 "minima", "discreta", "r", "discreto", "negativos", "negativas"]

#intermediate = ['i', '-', '----', '-', '*']

def replace_enconde(x):
    if x is np.nan: return np.nan
    
    if any(text in negative_words for text in unidecode(x.lower()).split(" ")):
        return 0
    elif any(text in positive_words for text in unidecode(x.lower()).split(" ")):
        return 1
    else:
        try:
            return mapping[x]
        except:
            return np.nan

final = pd.DataFrame(out)

def convertNumber(column):
    converted_column = final[column].map(try_float)
    final[column] = converted_column

for column in out.columns[2:]:
    if any(text in ["data", "conferido", "realizado", "detecção:"] for text in column.lower().split(" ")):
        del final[column]
        continue
    
    countings = out[column].value_counts().index.tolist()
    
    encodeList = []
    convertido = False
    for i in range(0, min(2, len(countings))):
        try:
            if "<" in countings[i] or ">" in countings[i]:
                # Apply to each value in the DataFrame
                convertNumber(column)
                convertido = True
                break
                
            float(countings[i])
            # Apply to each value in the DataFrame
            convertNumber(column)
            convertido = True
            break
        except:
            encodeList.append(countings[i])
    
    if not convertido:
        ids = le.fit_transform(encodeList)
        mapping = dict(zip(le.classes_, range(len(le.classes_))))
        converted_column = final[column].map(replace_enconde)
        final[column] = converted_column
        
#Diminuição de colunas com poucos dados
for column in final.columns:
    if final[column].count() < 0.1 * len(final.index):
        del final[column]
        continue

#Preenche todos os valores NaN com o valor pedido
for col in final.columns:
    n_unique = final[col].nunique(dropna=True)
    fill_value = 0.5 if n_unique <= 2 else 0
    final[col] = final[col].fillna(fill_value)

In [49]:
#Atualização dos sexos
final['IC_SEXO'] = le.fit_transform(final['IC_SEXO'])

#Atualização das idades
def calculate_age(born):
    #Checando o valor nulo padrão de datas
    try:
        return 2020 - int(born)
    except: 
        return np.nan

final['AA_NASCIMENTO'] = final['AA_NASCIMENTO'].map(calculate_age)
final['AA_NASCIMENTO'].fillna(final['AA_NASCIMENTO'].median(), inplace=True)

final.to_csv("ProcessedData.csv")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final['AA_NASCIMENTO'].fillna(final['AA_NASCIMENTO'].median(), inplace=True)


In [None]:
y = final[response]
X = final.drop(columns=[response])
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

foldsScore = []
foldStandartDeviation = []
foldAccuracy = []
#f score
foldF1 = []
FoldPrecision = []
#cobertura em portugues
FoldRecall = []
models = []

# treina o modelo para cada fold e guarda a pontuação
for train_index, test_index in kfold.split(X):
    clf = MLPClassifier(hidden_layer_sizes=(150, 50), max_iter=100, random_state=42)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    prediction = clf.predict(X_test)
    foldAccuracy.append(accuracy_score(y_test, prediction))
    foldF1.append(precision_score(y_test, prediction))
    FoldPrecision.append(precision_score(y_test, prediction))
    FoldRecall.append(recall_score(y_test, prediction))
    foldsScore.append(score)
    models.append(clf)

#media respostas
score = np.mean(foldsScore)
standartDeviation = np.std(foldsScore)

print(f"standard deviation: {standartDeviation:.4f}")
print(f"average score: {score:.4f}\n")

foldResults = [FoldRecall, FoldPrecision, foldAccuracy, foldF1]
foldsTitles = ["recall", "precision", "accuracy", "F1"]

for i in range(len(foldResults)):
    best = np.max(foldResults[i])
    max_index = np.argmax(foldResults[i])
    print(f"best {foldsTitles[i]}: {best:.4f}, with index {max_index}")
    worstRecall = np.min(foldResults[i])
    min_index = np.argmin(foldResults[i])
    print(f"worst {foldsTitles[i]}: {worstRecall:.4f}, with index {min_index}\n")

standard deviation: 0.0519
average score: 0.6175

best recall: 0.8649, with index 1
worst recall: 0.5614, with index 3

best precision: 0.8242, with index 0
worst precision: 0.7218, with index 1

best accuracy: 0.6863, with index 4
worst accuracy: 0.5425, with index 3

best F1: 0.8242, with index 0
worst F1: 0.7218, with index 1



In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

# Exemplo
# y_true = [1, 0, 1, 1, 0, 0]
# y_pred = [1, 0, 1, 0, 0, 1]

cm = confusion_matrix(y_true, y_pred)
print("Matriz de confusão:")
print(cm)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["Negativo", "Positivo"])
disp.plot(cmap="Blues")
plt.title("Matriz de Confusão")
plt.show()