In [1]:
import pandas as pd
import numpy as np

df = pd.read_excel('app_data.xlsx')
df_processed = df.copy()
target_variables = ['Diagnosis', 
                    'Management', 
                    'Severity']

In [2]:
#substitui valores NaN com a mediana
numeric_columns = df_processed.select_dtypes(include=np.number).columns
for col in numeric_columns:
    if df_processed[col].isnull().any():
        median_val = df_processed[col].median()
        df_processed[col].fillna(median_val, inplace=True)

#valores bool -> num
boolean_columns = df_processed.select_dtypes(include='bool').columns
for col in boolean_columns:
    df_processed[col] = df_processed[col].astype(int)
    
#valores nulos preenchidos com a moda
categorical_features_for_imputation = df_processed.select_dtypes(include=['object']).columns.tolist()
categorical_features_for_imputation = [col for col in categorical_features_for_imputation if col not in target_variables]

for col in categorical_features_for_imputation:
    if df_processed[col].isnull().any():
        mode_val = df_processed[col].mode()[0] 
        df_processed[col].fillna(mode_val, inplace=True)
        
#colunas categoricas -> num
categorical_features_for_ohe = df_processed.select_dtypes(include=['object']).columns.tolist()
categorical_features_for_ohe = [col for col in categorical_features_for_ohe if col not in target_variables] 

df_processed = pd.get_dummies(df_processed, columns=categorical_features_for_ohe, drop_first=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_processed[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split

#label categorica -> num
kf = KFold(n_splits=5, shuffle=True, random_state=42)

diagnosisModel = RandomForestClassifier(n_estimators=100, random_state=42)
managementModel = RandomForestClassifier(n_estimators=100, random_state=42)
severityModel = RandomForestClassifier(n_estimators=100, random_state=42)

diagLe = LabelEncoder()
managLe = LabelEncoder()
sevLe = LabelEncoder()

In [4]:
df_processed['Diagnosis_enc'] = diagLe.fit_transform(df_processed['Diagnosis'])
df_processed['Management_enc'] = managLe.fit_transform(df_processed['Management'])
df_processed['Severity_enc'] = sevLe.fit_transform(df_processed['Severity'])

encoded_cols = ['Diagnosis_enc', 'Management_enc', 'Severity_enc']

for col in encoded_cols:
    print(f"Value counts for '{col}':")
    print(df_processed[col].value_counts())
    print("-" * 30)

Value counts for 'Diagnosis_enc':
Diagnosis_enc
0    463
1    317
2      2
Name: count, dtype: int64
------------------------------
Value counts for 'Management_enc':
Management_enc
0    483
1    270
2     27
4      1
3      1
Name: count, dtype: int64
------------------------------
Value counts for 'Severity_enc':
Severity_enc
1    662
0    119
2      1
Name: count, dtype: int64
------------------------------


In [5]:
#dropa classes com baixa ocorrencia
rows_to_drop_indices = []
for col in encoded_cols:
    value_counts = df_processed[col].value_counts()
    low_occurrence_encoded_values = value_counts[(value_counts == 1) | (value_counts == 2)].index.tolist()

    if low_occurrence_encoded_values:
        indices_for_this_col = df_processed[df_processed[col].isin(low_occurrence_encoded_values)].index
        rows_to_drop_indices.extend(indices_for_this_col)

rows_to_drop_indices = list(set(rows_to_drop_indices))

#drpando colunas
if rows_to_drop_indices:
    df_processed.drop(rows_to_drop_indices, inplace=True)

for col in encoded_cols:
    print(f"\n{col}:")
    print(df_processed[col].value_counts())


Diagnosis_enc:
Diagnosis_enc
0    463
1    316
Name: count, dtype: int64

Management_enc:
Management_enc
0    483
1    270
2     26
Name: count, dtype: int64

Severity_enc:
Severity_enc
1    660
0    119
Name: count, dtype: int64


In [6]:
#diagnosis model
#variaveis
diag_X = df_processed.drop(target_variables + ['Diagnosis_enc'], axis=1)
#classificadores
diag_y = df_processed['Diagnosis_enc']

diag_X_train, diag_X_test, diag_y_train, diag_y_test = train_test_split(diag_X, diag_y, test_size=0.2, random_state=42, stratify=diag_y)

diagnosis_cv_scores = cross_val_score(diagnosisModel, diag_X, diag_y, cv=kf, scoring='accuracy')

diagnosisModel.fit(diag_X_train, diag_y_train)

In [7]:
#management model 
manag_X = df_processed.drop(target_variables + ['Management_enc'], axis=1)

manag_y = df_processed['Management_enc']

manag_X_train, manag_X_test, manag_y_train, manag_y_test = train_test_split(manag_X, manag_y, test_size=0.2, random_state=42, stratify=manag_y)

manag_cv_scores = cross_val_score(managementModel, manag_X, manag_y, cv=kf, scoring='accuracy')

managementModel.fit(manag_X_train, manag_y_train)

In [8]:
#severity model
sev_X = df_processed.drop(target_variables + ['Severity_enc'], axis=1)

sev_y = df_processed['Severity_enc']

sev_X_train, sev_X_test, sev_y_train, sev_y_test = train_test_split(sev_X, sev_y, test_size=0.2, random_state=42, stratify=sev_y)

severity_cv_scores = cross_val_score(severityModel, sev_X, sev_y, cv=kf, scoring='accuracy')

severityModel.fit(sev_X_train, sev_y_train)

In [None]:
#salvando modelos treinados
import joblib
import os

SAVE_MODEL = 'Models'
if not os.path.exists(SAVE_MODEL):
    os.makedirs(SAVE_MODEL)
    
joblib.dump(diagnosisModel, os.path.join(SAVE_MODEL, 'diagnosis_model.joblib'))
joblib.dump(diagLe, os.path.join(SAVE_MODEL, 'leDiagnosis.joblib'))

joblib.dump(managementModel, os.path.join(SAVE_MODEL, 'management_model.joblib'))
joblib.dump(managLe, os.path.join(SAVE_MODEL, 'leManagement.joblib'))

joblib.dump(severityModel, os.path.join(SAVE_MODEL, 'severity_model.joblib'))
joblib.dump(sevLe, os.path.join(SAVE_MODEL, 'leSeverity.joblib'))

joblib.dump(diag_X.columns.tolist(), os.path.join(SAVE_MODEL, 'trained_X_col_diagnosis.joblib'))
joblib.dump(manag_X.columns.tolist(), os.path.join(SAVE_MODEL, 'trained_X_col_management.joblib'))
joblib.dump(sev_X.columns.tolist(), os.path.join(SAVE_MODEL, 'trained_X_col_severity.joblib'))
joblib.dump(df.columns.tolist(), os.path.join(SAVE_MODEL, 'df_columns_origin.joblib'))

['models\\df_colunas_origin.joblib']